111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 48c3ff71bSJunchao Zhang #include <petscsystypes.h> 58c3ff71bSJunchao Zhang #include <petscerror.h> 68c3ff71bSJunchao Zhang 78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 8f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 9076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 288c3ff71bSJunchao Zhang PetscErrorCode ierr; 29076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 30076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang PetscFunctionBegin; 33076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 348c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35076ba34aSJunchao Zhang 36076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38076ba34aSJunchao Zhang 39076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 40076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 42076ba34aSJunchao Zhang */ 43076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44076ba34aSJunchao Zhang delete aijkok; 45076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46076ba34aSJunchao Zhang A->spptr = aijkok; 47076ba34aSJunchao Zhang } 48076ba34aSJunchao Zhang 49a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 50a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51a587d139SMark } 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 578c3ff71bSJunchao Zhang { 588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598c3ff71bSJunchao Zhang 608c3ff71bSJunchao Zhang PetscFunctionBegin; 6186a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62076ba34aSJunchao Zhang if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 65076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 66076ba34aSJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 6786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang PetscFunctionReturn(0); 708c3ff71bSJunchao Zhang } 718c3ff71bSJunchao Zhang 72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 73fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7486a27549SJunchao Zhang { 7586a27549SJunchao Zhang PetscErrorCode ierr; 7686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7786a27549SJunchao Zhang 7886a27549SJunchao Zhang PetscFunctionBegin; 7986a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 8086a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8186a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 842328674fSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 8586a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8686a27549SJunchao Zhang PetscFunctionReturn(0); 8786a27549SJunchao Zhang } 8886a27549SJunchao Zhang 89f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90f0cf5187SStefano Zampini { 91f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92f0cf5187SStefano Zampini 93f0cf5187SStefano Zampini PetscFunctionBegin; 94f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9586a27549SJunchao Zhang /* We do not expect one needs factors on host */ 9686a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 99f0cf5187SStefano Zampini PetscFunctionReturn(0); 100f0cf5187SStefano Zampini } 101f0cf5187SStefano Zampini 102f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103f0cf5187SStefano Zampini { 104076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105f0cf5187SStefano Zampini 106f0cf5187SStefano Zampini PetscFunctionBegin; 107076ba34aSJunchao Zhang if (aijkok) { 108076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 109076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 110076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 111076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112076ba34aSJunchao Zhang } 113076ba34aSJunchao Zhang PetscFunctionReturn(0); 114076ba34aSJunchao Zhang } 115076ba34aSJunchao Zhang 116076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117076ba34aSJunchao Zhang { 118076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119076ba34aSJunchao Zhang 120076ba34aSJunchao Zhang PetscFunctionBegin; 121076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 122076ba34aSJunchao Zhang PetscFunctionReturn(0); 123076ba34aSJunchao Zhang } 124076ba34aSJunchao Zhang 125076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126076ba34aSJunchao Zhang { 127076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128076ba34aSJunchao Zhang 129076ba34aSJunchao Zhang PetscFunctionBegin; 1302328674fSJunchao Zhang if (aijkok) { 131076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 132076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1332328674fSJunchao Zhang } else { 1342328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1352328674fSJunchao Zhang } 136076ba34aSJunchao Zhang PetscFunctionReturn(0); 137076ba34aSJunchao Zhang } 138076ba34aSJunchao Zhang 139076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140076ba34aSJunchao Zhang { 141076ba34aSJunchao Zhang PetscFunctionBegin; 142076ba34aSJunchao Zhang *array = NULL; 143076ba34aSJunchao Zhang PetscFunctionReturn(0); 144076ba34aSJunchao Zhang } 145076ba34aSJunchao Zhang 146076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147076ba34aSJunchao Zhang { 148076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149076ba34aSJunchao Zhang 150076ba34aSJunchao Zhang PetscFunctionBegin; 1512328674fSJunchao Zhang if (aijkok) { 152076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1532328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1542328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1552328674fSJunchao Zhang } 156076ba34aSJunchao Zhang PetscFunctionReturn(0); 157076ba34aSJunchao Zhang } 158076ba34aSJunchao Zhang 159076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160076ba34aSJunchao Zhang { 161076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162076ba34aSJunchao Zhang 163076ba34aSJunchao Zhang PetscFunctionBegin; 1642328674fSJunchao Zhang if (aijkok) { 165076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 166076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1672328674fSJunchao Zhang } 168f0cf5187SStefano Zampini PetscFunctionReturn(0); 169f0cf5187SStefano Zampini } 170f0cf5187SStefano Zampini 171a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173a587d139SMark { 174042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176a587d139SMark 177a587d139SMark PetscFunctionBegin; 178076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181a587d139SMark PetscFunctionReturn(0); 182a587d139SMark } 183a587d139SMark 184a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186a587d139SMark { 187042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188a587d139SMark 189a587d139SMark PetscFunctionBegin; 190a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 191a587d139SMark *d_mat = aijkok->device_mat_d.data(); 192a587d139SMark } else { 193a587d139SMark PetscErrorCode ierr; 194a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195a587d139SMark *d_mat = NULL; 196a587d139SMark } 197a587d139SMark PetscFunctionReturn(0); 198a587d139SMark } 199076ba34aSJunchao Zhang 200076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 201076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202152b3e56SJunchao Zhang { 203152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204152b3e56SJunchao Zhang 205152b3e56SJunchao Zhang PetscFunctionBegin; 206076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 210076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 21286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 213076ba34aSJunchao Zhang } 214076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 215152b3e56SJunchao Zhang PetscFunctionReturn(0); 216152b3e56SJunchao Zhang } 217152b3e56SJunchao Zhang 218076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220152b3e56SJunchao Zhang { 221152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222152b3e56SJunchao Zhang 223152b3e56SJunchao Zhang PetscFunctionBegin; 224076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 226076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 227076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 228076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 229076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 230076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 231076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 232076ba34aSJunchao Zhang #endif 23386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 234076ba34aSJunchao Zhang } 235076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 236152b3e56SJunchao Zhang PetscFunctionReturn(0); 237152b3e56SJunchao Zhang } 238a587d139SMark 2398c3ff71bSJunchao Zhang /* y = A x */ 2408c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2418c3ff71bSJunchao Zhang { 2428c3ff71bSJunchao Zhang PetscErrorCode ierr; 2438c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 244152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 245152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2468c3ff71bSJunchao Zhang 2478c3ff71bSJunchao Zhang PetscFunctionBegin; 2488c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 249152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 250152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 2518c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 252152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 253152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 254152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 255bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 256076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 257152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2588c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2598c3ff71bSJunchao Zhang } 2608c3ff71bSJunchao Zhang 2618c3ff71bSJunchao Zhang /* y = A^T x */ 2628c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2638c3ff71bSJunchao Zhang { 2648c3ff71bSJunchao Zhang PetscErrorCode ierr; 2658c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 266152b3e56SJunchao Zhang const char *mode; 267152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 268152b3e56SJunchao Zhang PetscScalarKokkosView yv; 269076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2708c3ff71bSJunchao Zhang 2718c3ff71bSJunchao Zhang PetscFunctionBegin; 2728c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 273152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 274152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 275152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 276076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 277152b3e56SJunchao Zhang mode = "N"; 278152b3e56SJunchao Zhang } else { 279076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 280076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 281152b3e56SJunchao Zhang mode = "T"; 282152b3e56SJunchao Zhang } 283076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 284152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 285152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 286bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 287076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 2888c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2898c3ff71bSJunchao Zhang } 2908c3ff71bSJunchao Zhang 2918c3ff71bSJunchao Zhang /* y = A^H x */ 2928c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2938c3ff71bSJunchao Zhang { 2948c3ff71bSJunchao Zhang PetscErrorCode ierr; 2958c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 296152b3e56SJunchao Zhang const char *mode; 297152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 298152b3e56SJunchao Zhang PetscScalarKokkosView yv; 299076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3008c3ff71bSJunchao Zhang 3018c3ff71bSJunchao Zhang PetscFunctionBegin; 3028c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 303152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 304152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 305152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 306076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 307152b3e56SJunchao Zhang mode = "N"; 308152b3e56SJunchao Zhang } else { 309076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 310076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 311152b3e56SJunchao Zhang mode = "C"; 312152b3e56SJunchao Zhang } 313076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 314152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 315152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 316bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 317076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 3188c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3198c3ff71bSJunchao Zhang } 3208c3ff71bSJunchao Zhang 3218c3ff71bSJunchao Zhang /* z = A x + y */ 3228c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3238c3ff71bSJunchao Zhang { 3248c3ff71bSJunchao Zhang PetscErrorCode ierr; 3258c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 326152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 327152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3288c3ff71bSJunchao Zhang 3298c3ff71bSJunchao Zhang PetscFunctionBegin; 3308c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 331152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 332152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 333152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3348c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3358c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 336152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 337152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 338152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 339152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 340bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 341152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 3428c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3438c3ff71bSJunchao Zhang } 3448c3ff71bSJunchao Zhang 3458c3ff71bSJunchao Zhang /* z = A^T x + y */ 3468c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3478c3ff71bSJunchao Zhang { 3488c3ff71bSJunchao Zhang PetscErrorCode ierr; 3498c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 350152b3e56SJunchao Zhang const char *mode; 351152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 352152b3e56SJunchao Zhang PetscScalarKokkosView zv; 353076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3548c3ff71bSJunchao Zhang 3558c3ff71bSJunchao Zhang PetscFunctionBegin; 3568c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 357152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 358152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 359152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3608c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 361152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 362076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 363152b3e56SJunchao Zhang mode = "N"; 364152b3e56SJunchao Zhang } else { 365076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 366076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 367152b3e56SJunchao Zhang mode = "T"; 368152b3e56SJunchao Zhang } 369076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 370152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 371152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 372152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 373bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 374076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 3758c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3768c3ff71bSJunchao Zhang } 3778c3ff71bSJunchao Zhang 3788c3ff71bSJunchao Zhang /* z = A^H x + y */ 3798c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3808c3ff71bSJunchao Zhang { 3818c3ff71bSJunchao Zhang PetscErrorCode ierr; 3828c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 383152b3e56SJunchao Zhang const char *mode; 384152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 385152b3e56SJunchao Zhang PetscScalarKokkosView zv; 386076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3878c3ff71bSJunchao Zhang 3888c3ff71bSJunchao Zhang PetscFunctionBegin; 3898c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 390152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 391152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 392152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3938c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 394152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 395076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 396152b3e56SJunchao Zhang mode = "N"; 397152b3e56SJunchao Zhang } else { 398076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 399076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 400152b3e56SJunchao Zhang mode = "C"; 401152b3e56SJunchao Zhang } 402076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 403152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 404152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 405152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 406bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 407076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 408152b3e56SJunchao Zhang PetscFunctionReturn(0); 409152b3e56SJunchao Zhang } 410152b3e56SJunchao Zhang 411152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 412152b3e56SJunchao Zhang { 413152b3e56SJunchao Zhang PetscErrorCode ierr; 414152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 415152b3e56SJunchao Zhang 416152b3e56SJunchao Zhang PetscFunctionBegin; 417152b3e56SJunchao Zhang switch (op) { 418152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 419152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 420152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 421152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 422152b3e56SJunchao Zhang break; 423152b3e56SJunchao Zhang default: 424152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 425152b3e56SJunchao Zhang break; 426152b3e56SJunchao Zhang } 4278c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4288c3ff71bSJunchao Zhang } 4298c3ff71bSJunchao Zhang 430076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4313d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4328c3ff71bSJunchao Zhang { 4338c3ff71bSJunchao Zhang PetscErrorCode ierr; 434076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4358c3ff71bSJunchao Zhang 4368c3ff71bSJunchao Zhang PetscFunctionBegin; 437a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 438076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 439076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4408c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 441076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 442076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 443076ba34aSJunchao Zhang if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 444076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 445076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 446076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 447076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 448076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 449076ba34aSJunchao Zhang if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 450076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 451076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4528c3ff71bSJunchao Zhang } 453076ba34aSJunchao Zhang } 4548c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4558c3ff71bSJunchao Zhang } 4568c3ff71bSJunchao Zhang 457076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 458076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 459076ba34aSJunchao Zhang */ 460076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4618c3ff71bSJunchao Zhang { 4628c3ff71bSJunchao Zhang PetscErrorCode ierr; 463076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 464076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 465076ba34aSJunchao Zhang Mat mat; 4668c3ff71bSJunchao Zhang 4678c3ff71bSJunchao Zhang PetscFunctionBegin; 468076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 469076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 470076ba34aSJunchao Zhang mat = *B; 471076ba34aSJunchao Zhang if (A->assembled) { 472076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 473076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 474076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 475076ba34aSJunchao Zhang /* Now copy values to B if needed */ 476076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 477076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 478076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 479076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 480076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 481076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 482076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 483076ba34aSJunchao Zhang } 484076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 485076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 486076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 487076ba34aSJunchao Zhang } 488076ba34aSJunchao Zhang mat->spptr = bkok; 489076ba34aSJunchao Zhang } 490076ba34aSJunchao Zhang 491076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 492076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 493076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 494076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 4958c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4968c3ff71bSJunchao Zhang } 4978c3ff71bSJunchao Zhang 4980ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 4990ecb592aSJunchao Zhang { 5000ecb592aSJunchao Zhang PetscErrorCode ierr; 5010ecb592aSJunchao Zhang Mat At; 5020ecb592aSJunchao Zhang KokkosCsrMatrix *internT,*csrmatT; 5030ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 5040ecb592aSJunchao Zhang 5050ecb592aSJunchao Zhang PetscFunctionBegin; 5060ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 5070ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 5080ecb592aSJunchao Zhang CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 5090ecb592aSJunchao Zhang CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 5100ecb592aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 5110ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5127a3b1c56SStefano Zampini else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 5130ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5140ecb592aSJunchao Zhang if ((*B)->assembled) { 5150ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5160ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 5170ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 5180ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5190ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5200ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5210ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5220ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 5230ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5240ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5250ecb592aSJunchao Zhang } 5260ecb592aSJunchao Zhang PetscFunctionReturn(0); 5270ecb592aSJunchao Zhang } 5280ecb592aSJunchao Zhang 5298c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5308c3ff71bSJunchao Zhang { 5318c3ff71bSJunchao Zhang PetscErrorCode ierr; 53286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5338c3ff71bSJunchao Zhang 5348c3ff71bSJunchao Zhang PetscFunctionBegin; 53586a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 53686a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5378f7e8f9dSMark Adams if (aijkok) { 5388f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 539a587d139SMark delete aijkok->colmap_d; 540a587d139SMark delete aijkok->i_uncompressed_d; 541a587d139SMark } 5428f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 5438f7e8f9dSMark Adams } 5448c3ff71bSJunchao Zhang delete aijkok; 54586a27549SJunchao Zhang } else { 54686a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 54786a27549SJunchao Zhang } 548152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 549152b3e56SJunchao Zhang A->spptr = NULL; 5508c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5518c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5528c3ff71bSJunchao Zhang } 5538c3ff71bSJunchao Zhang 55486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 55586a27549SJunchao Zhang { 55686a27549SJunchao Zhang PetscErrorCode ierr; 55786a27549SJunchao Zhang 55886a27549SJunchao Zhang PetscFunctionBegin; 55986a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 56086a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 56186a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 56286a27549SJunchao Zhang PetscFunctionReturn(0); 56386a27549SJunchao Zhang } 56486a27549SJunchao Zhang 565076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 566076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 567a3f881fbSStefano Zampini { 568076ba34aSJunchao Zhang PetscErrorCode ierr; 569076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 570076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 571076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 572076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 573076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 574076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 575a3f881fbSStefano Zampini 576a3f881fbSStefano Zampini PetscFunctionBegin; 577076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 578076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 579076ba34aSJunchao Zhang PetscValidPointer(C,4); 580076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 581076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 582c0aa6a63SJacob Faibussowitsch if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 583076ba34aSJunchao Zhang if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 584076ba34aSJunchao Zhang 585076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 586076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 587076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 588076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 589076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 590076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 591076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 592076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 593076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 594076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 595076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 596076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 597076ba34aSJunchao Zhang nnz = a->nz + b->nz; 598076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 599076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 600076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 601076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 602076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 603076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 604076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 605076ba34aSJunchao Zhang ca = ca_dual.view_device(); 606076ba34aSJunchao Zhang ci = ci_dual.view_device(); 607076ba34aSJunchao Zhang cj = cj_dual.view_device(); 608076ba34aSJunchao Zhang 609076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 610076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 611076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 612076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 613076ba34aSJunchao Zhang 614076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 615076ba34aSJunchao Zhang ci(i) = coffset; 616076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 617076ba34aSJunchao Zhang }); 618076ba34aSJunchao Zhang 619076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 620076ba34aSJunchao Zhang if (k < alen) { 621076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 622076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 623076ba34aSJunchao Zhang } else { 624076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 625076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 626076ba34aSJunchao Zhang } 627076ba34aSJunchao Zhang }); 628076ba34aSJunchao Zhang }); 629076ba34aSJunchao Zhang ca_dual.modify_device(); 630076ba34aSJunchao Zhang ci_dual.modify_device(); 631076ba34aSJunchao Zhang cj_dual.modify_device(); 632076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 633076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 634076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 635076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 636076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 637076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 638076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 639076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 640076ba34aSJunchao Zhang 641076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 642076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 643076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 644076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 645076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 646076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 647076ba34aSJunchao Zhang }); 648076ba34aSJunchao Zhang }); 649076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 650076ba34aSJunchao Zhang } 651076ba34aSJunchao Zhang PetscFunctionReturn(0); 652076ba34aSJunchao Zhang } 653076ba34aSJunchao Zhang 654076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 655076ba34aSJunchao Zhang { 656076ba34aSJunchao Zhang PetscFunctionBegin; 657076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 658a3f881fbSStefano Zampini PetscFunctionReturn(0); 659a3f881fbSStefano Zampini } 660a3f881fbSStefano Zampini 661a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 662a3f881fbSStefano Zampini { 663076ba34aSJunchao Zhang PetscErrorCode ierr; 664a3f881fbSStefano Zampini Mat_Product *product = C->product; 665a3f881fbSStefano Zampini Mat A,B; 666076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 667a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 668a3f881fbSStefano Zampini Mat_SeqAIJ *c; 669076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 670076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 671a3f881fbSStefano Zampini 672a3f881fbSStefano Zampini PetscFunctionBegin; 673a3f881fbSStefano Zampini MatCheckProduct(C,1); 674a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 675076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 676076ba34aSJunchao Zhang 677076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 678076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 679076ba34aSJunchao Zhang PetscFunctionReturn(0); 680076ba34aSJunchao Zhang } 681076ba34aSJunchao Zhang 682076ba34aSJunchao Zhang switch (product->type) { 683076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 684076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 685076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 686076ba34aSJunchao Zhang default: 687076ba34aSJunchao Zhang SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 688076ba34aSJunchao Zhang } 689076ba34aSJunchao Zhang 690a3f881fbSStefano Zampini A = product->A; 691a3f881fbSStefano Zampini B = product->B; 692a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 693a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 694a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 695a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 696a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 697076ba34aSJunchao Zhang 698076ba34aSJunchao Zhang if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 699076ba34aSJunchao Zhang 700076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 701076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 702076ba34aSJunchao Zhang 703076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 704076ba34aSJunchao Zhang if (transA) { 705076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 706076ba34aSJunchao Zhang transA = false; 707a3f881fbSStefano Zampini } 708a3f881fbSStefano Zampini 709076ba34aSJunchao Zhang if (transB) { 710076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 711076ba34aSJunchao Zhang transB = false; 712076ba34aSJunchao Zhang } 713076ba34aSJunchao Zhang 714076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 715076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 716076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 717a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 718a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 719c0aa6a63SJacob Faibussowitsch ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 720a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 721c0aa6a63SJacob Faibussowitsch ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 722a3f881fbSStefano Zampini c->reallocs = 0; 723076ba34aSJunchao Zhang C->info.mallocs = 0; 724a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 725a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 726a3f881fbSStefano Zampini C->num_ass++; 727a3f881fbSStefano Zampini PetscFunctionReturn(0); 728a3f881fbSStefano Zampini } 729a3f881fbSStefano Zampini 730a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 731a3f881fbSStefano Zampini { 732a3f881fbSStefano Zampini PetscErrorCode ierr; 733076ba34aSJunchao Zhang Mat_Product *product = C->product; 734076ba34aSJunchao Zhang MatProductType ptype; 735076ba34aSJunchao Zhang Mat A,B; 736076ba34aSJunchao Zhang bool transA,transB; 737076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 738076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 739076ba34aSJunchao Zhang MPI_Comm comm; 740076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 741a3f881fbSStefano Zampini 742a3f881fbSStefano Zampini PetscFunctionBegin; 743a3f881fbSStefano Zampini MatCheckProduct(C,1); 744076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 745076ba34aSJunchao Zhang if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 746a3f881fbSStefano Zampini A = product->A; 747a3f881fbSStefano Zampini B = product->B; 748a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 749a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 750a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 751a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 752076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 753076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 754076ba34aSJunchao Zhang 755a3f881fbSStefano Zampini ptype = product->type; 756a3f881fbSStefano Zampini switch (ptype) { 757076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 758076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 759076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 760a3f881fbSStefano Zampini default: 761076ba34aSJunchao Zhang SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 762a3f881fbSStefano Zampini } 763a3f881fbSStefano Zampini 764076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 765076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 766076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 767076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 768a3f881fbSStefano Zampini 769076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 770076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 771*6ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 772*6ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 773*6ffb9508SJunchao Zhang */ 774076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 775076ba34aSJunchao Zhang 776076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 777076ba34aSJunchao Zhang if (transA) { 778076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 779076ba34aSJunchao Zhang transA = false; 780076ba34aSJunchao Zhang } 781076ba34aSJunchao Zhang 782076ba34aSJunchao Zhang if (transB) { 783076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 784076ba34aSJunchao Zhang transB = false; 785076ba34aSJunchao Zhang } 786076ba34aSJunchao Zhang 787076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 788076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 789076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 790076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 791076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 792076ba34aSJunchao Zhang */ 793076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 794076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 795076ba34aSJunchao Zhang 796076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 797076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 798076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 799a3f881fbSStefano Zampini PetscFunctionReturn(0); 800a3f881fbSStefano Zampini } 801a3f881fbSStefano Zampini 802a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 803076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 804a3f881fbSStefano Zampini { 805a3f881fbSStefano Zampini PetscErrorCode ierr; 806076ba34aSJunchao Zhang Mat_Product *product = mat->product; 807a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 808a3f881fbSStefano Zampini 809a3f881fbSStefano Zampini PetscFunctionBegin; 810a3f881fbSStefano Zampini MatCheckProduct(mat,1); 811a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 812a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 813a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 814a3f881fbSStefano Zampini } 815a3f881fbSStefano Zampini if (Biskok && Ciskok) { 816a3f881fbSStefano Zampini switch (product->type) { 817a3f881fbSStefano Zampini case MATPRODUCT_AB: 818a3f881fbSStefano Zampini case MATPRODUCT_AtB: 819a3f881fbSStefano Zampini case MATPRODUCT_ABt: 820a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 821a3f881fbSStefano Zampini break; 822a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 823a3f881fbSStefano Zampini case MATPRODUCT_RARt: 824a3f881fbSStefano Zampini case MATPRODUCT_ABC: 825a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 826a3f881fbSStefano Zampini break; 827a3f881fbSStefano Zampini default: 828a3f881fbSStefano Zampini break; 829a3f881fbSStefano Zampini } 830a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 831a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 832a3f881fbSStefano Zampini } 833a3f881fbSStefano Zampini PetscFunctionReturn(0); 834a3f881fbSStefano Zampini } 835a587d139SMark 836f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 837f0cf5187SStefano Zampini { 838f0cf5187SStefano Zampini PetscErrorCode ierr; 839f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 840f0cf5187SStefano Zampini 841f0cf5187SStefano Zampini PetscFunctionBegin; 842f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 843f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 844076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 845076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 846f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 847076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 848f0cf5187SStefano Zampini PetscFunctionReturn(0); 849f0cf5187SStefano Zampini } 850f0cf5187SStefano Zampini 851a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 852a587d139SMark { 853a587d139SMark PetscErrorCode ierr; 854076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 855a587d139SMark 856a587d139SMark PetscFunctionBegin; 857076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8582328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 859076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 860076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 8612328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8622328674fSJunchao Zhang ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 8632328674fSJunchao Zhang } 864a587d139SMark PetscFunctionReturn(0); 865a587d139SMark } 866a587d139SMark 867db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 868db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 869db78de30SJunchao Zhang { 870db78de30SJunchao Zhang PetscErrorCode ierr; 871db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 872db78de30SJunchao Zhang 873db78de30SJunchao Zhang PetscFunctionBegin; 874db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 875db78de30SJunchao Zhang PetscValidPointer(kv,2); 876db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 877db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 878db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 879076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 880db78de30SJunchao Zhang PetscFunctionReturn(0); 881db78de30SJunchao Zhang } 882db78de30SJunchao Zhang 883db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 884db78de30SJunchao Zhang { 885db78de30SJunchao Zhang PetscFunctionBegin; 886db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887db78de30SJunchao Zhang PetscValidPointer(kv,2); 888db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 889db78de30SJunchao Zhang PetscFunctionReturn(0); 890db78de30SJunchao Zhang } 891db78de30SJunchao Zhang 892db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 893db78de30SJunchao Zhang { 894db78de30SJunchao Zhang PetscErrorCode ierr; 895db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 896db78de30SJunchao Zhang 897db78de30SJunchao Zhang PetscFunctionBegin; 898db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899db78de30SJunchao Zhang PetscValidPointer(kv,2); 900db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 902db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 903076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 904db78de30SJunchao Zhang PetscFunctionReturn(0); 905db78de30SJunchao Zhang } 906db78de30SJunchao Zhang 907db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 908db78de30SJunchao Zhang { 909db78de30SJunchao Zhang PetscErrorCode ierr; 910db78de30SJunchao Zhang 911db78de30SJunchao Zhang PetscFunctionBegin; 912db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 913db78de30SJunchao Zhang PetscValidPointer(kv,2); 914db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 915076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 916db78de30SJunchao Zhang PetscFunctionReturn(0); 917db78de30SJunchao Zhang } 918db78de30SJunchao Zhang 919db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 920db78de30SJunchao Zhang { 921db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 922db78de30SJunchao Zhang 923db78de30SJunchao Zhang PetscFunctionBegin; 924db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 925db78de30SJunchao Zhang PetscValidPointer(kv,2); 926db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 927db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 928076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 929db78de30SJunchao Zhang PetscFunctionReturn(0); 930db78de30SJunchao Zhang } 931db78de30SJunchao Zhang 932db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 933db78de30SJunchao Zhang { 934db78de30SJunchao Zhang PetscErrorCode ierr; 935db78de30SJunchao Zhang 936db78de30SJunchao Zhang PetscFunctionBegin; 937db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 938db78de30SJunchao Zhang PetscValidPointer(kv,2); 939db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 940076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 941db78de30SJunchao Zhang PetscFunctionReturn(0); 942db78de30SJunchao Zhang } 943db78de30SJunchao Zhang 944c17cf699SJunchao Zhang /* Computes Y += alpha X */ 945c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 946a587d139SMark { 947a587d139SMark PetscErrorCode ierr; 948a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 949c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 950c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 951c17cf699SJunchao Zhang MatScalarKokkosView Ya; 952a587d139SMark 953a587d139SMark PetscFunctionBegin; 954c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 955c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 956c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 957c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 958db78de30SJunchao Zhang 959c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 960c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 961a587d139SMark PetscBool e; 962a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 963a587d139SMark if (e) { 964a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 965c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 966a587d139SMark } 967a587d139SMark } 968db78de30SJunchao Zhang 969c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 970c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 971c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 972c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 973c17cf699SJunchao Zhang */ 974c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 975c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 976c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 977c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 978c17cf699SJunchao Zhang 979c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 980c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 981c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 982c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 983c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 984c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 985c17cf699SJunchao Zhang 986c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 987c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 988c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 989c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 990c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 991c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 992c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 993c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 994c17cf699SJunchao Zhang q++; 995a587d139SMark } else { 996c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 997c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 998c17cf699SJunchao Zhang */ 999c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1000a587d139SMark } 1001c17cf699SJunchao Zhang } 1002c17cf699SJunchao Zhang }); 1003c17cf699SJunchao Zhang }); 1004c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 1005c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1006c17cf699SJunchao Zhang Mat Z; 1007c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1008c17cf699SJunchao Zhang KernelHandle kh; 1009c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1010c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1011c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1012c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 1013c17cf699SJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1014c17cf699SJunchao Zhang ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1015c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1016c17cf699SJunchao Zhang } 1017c17cf699SJunchao Zhang 1018a587d139SMark PetscFunctionReturn(0); 1019a587d139SMark } 1020a587d139SMark 102186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 10228f7e8f9dSMark Adams { 10238f7e8f9dSMark Adams PetscErrorCode ierr; 10248f7e8f9dSMark Adams 10258f7e8f9dSMark Adams PetscFunctionBegin; 10268f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 10278f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 10288f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 10298f7e8f9dSMark Adams PetscFunctionReturn(0); 10308f7e8f9dSMark Adams } 10318f7e8f9dSMark Adams 10328c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 10338c3ff71bSJunchao Zhang { 1034076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1035076ba34aSJunchao Zhang 10368c3ff71bSJunchao Zhang PetscFunctionBegin; 1037076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 10386f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 10396f3d89d0SStefano Zampini 10408c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 10418c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 10428c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1043a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1044f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1045a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1046076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 10478c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 10488c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 10498c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 10508c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 10518c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 10528c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1053076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 10540ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1055152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1056076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1057076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1058076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1059076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1060076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1061076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1062076ba34aSJunchao Zhang PetscFunctionReturn(0); 1063076ba34aSJunchao Zhang } 1064076ba34aSJunchao Zhang 1065076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1066076ba34aSJunchao Zhang { 1067076ba34aSJunchao Zhang PetscErrorCode ierr; 1068076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1069076ba34aSJunchao Zhang PetscInt i,m,n; 1070076ba34aSJunchao Zhang 1071076ba34aSJunchao Zhang PetscFunctionBegin; 1072076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1073076ba34aSJunchao Zhang 1074076ba34aSJunchao Zhang m = akok->nrows(); 1075076ba34aSJunchao Zhang n = akok->ncols(); 1076076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1077076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1078076ba34aSJunchao Zhang 1079076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1080076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1081076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1082076ba34aSJunchao Zhang 1083076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1084076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1085076ba34aSJunchao Zhang 1086076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1087076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1088076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1089076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1090076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1091076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1092076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1093076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1094076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1095076ba34aSJunchao Zhang 1096076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1097076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1098076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1099076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1100076ba34aSJunchao Zhang } 1101076ba34aSJunchao Zhang 1102076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1103076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1104076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1105076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1106076ba34aSJunchao Zhang A->spptr = akok; 1107076ba34aSJunchao Zhang PetscFunctionReturn(0); 1108076ba34aSJunchao Zhang } 1109076ba34aSJunchao Zhang 1110076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1111076ba34aSJunchao Zhang 1112076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1113076ba34aSJunchao Zhang */ 1114076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1115076ba34aSJunchao Zhang { 1116076ba34aSJunchao Zhang PetscErrorCode ierr; 1117076ba34aSJunchao Zhang 1118076ba34aSJunchao Zhang PetscFunctionBegin; 1119076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1120076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 11218c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11228c3ff71bSJunchao Zhang } 11238c3ff71bSJunchao Zhang 11248c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1125152b3e56SJunchao Zhang /*@C 11268c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 11278c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 11288c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 11298c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 11308c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 11318c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 11328c3ff71bSJunchao Zhang 11338c3ff71bSJunchao Zhang Collective 11348c3ff71bSJunchao Zhang 11358c3ff71bSJunchao Zhang Input Parameters: 11368c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 11378c3ff71bSJunchao Zhang . m - number of rows 11388c3ff71bSJunchao Zhang . n - number of columns 11398c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 11408c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 11418c3ff71bSJunchao Zhang (possibly different for each row) or NULL 11428c3ff71bSJunchao Zhang 11438c3ff71bSJunchao Zhang Output Parameter: 11448c3ff71bSJunchao Zhang . A - the matrix 11458c3ff71bSJunchao Zhang 11468c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 11478c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 11488c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 11498c3ff71bSJunchao Zhang 11508c3ff71bSJunchao Zhang Notes: 11518c3ff71bSJunchao Zhang If nnz is given then nz is ignored 11528c3ff71bSJunchao Zhang 11538c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 11548c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 11558c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 11568c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 11578c3ff71bSJunchao Zhang 11588c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 11598c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 11608c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 11618c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 11628c3ff71bSJunchao Zhang 11638c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 11648c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 11658c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 11668c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 11678c3ff71bSJunchao Zhang 11688c3ff71bSJunchao Zhang Level: intermediate 11698c3ff71bSJunchao Zhang 1170076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 11718c3ff71bSJunchao Zhang @*/ 11728c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 11738c3ff71bSJunchao Zhang { 11748c3ff71bSJunchao Zhang PetscErrorCode ierr; 11758c3ff71bSJunchao Zhang 11768c3ff71bSJunchao Zhang PetscFunctionBegin; 11778c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 11788c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 11798c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 11808c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 11818c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 11828c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11838c3ff71bSJunchao Zhang } 1184930e68a5SMark Adams 11858f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 11868f7e8f9dSMark Adams // 11878f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 11888f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 11898f7e8f9dSMark Adams // 11908f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1191930e68a5SMark Adams { 11928f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 11938f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 11948f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1195930e68a5SMark Adams PetscErrorCode ierr; 11968f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1197076ba34aSJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1198076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1199076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 12008f7e8f9dSMark Adams PetscBool row_identity,col_identity; 12018f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 12028f7e8f9dSMark Adams PetscContainer container; 1203930e68a5SMark Adams 1204930e68a5SMark Adams PetscFunctionBegin; 1205c0aa6a63SJacob Faibussowitsch if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 12068f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 12078f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 12088f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 12098f7e8f9dSMark Adams if (container) { 12108f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 12118f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 12128f7e8f9dSMark Adams Nf = (*pNf)%1000; 12138f7e8f9dSMark Adams nv = (*pNf)/1000; 12148f7e8f9dSMark Adams if (nv>0) nVec = nv; 12158f7e8f9dSMark Adams } else Nf = 1; 1216c0aa6a63SJacob Faibussowitsch if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf); 12178f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 12188f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 12198f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 12208f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 12218f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 12228f7e8f9dSMark Adams #endif 12238f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 12248f7e8f9dSMark Adams { 12258f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 12268f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 12278f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 12288f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 12298f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 12308f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 12318f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 12328f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 12338f7e8f9dSMark Adams size_t flops_h = 0.0; 12348f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 12358f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 12368f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 12378f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 12388f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 12398f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 12408f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 12418f7e8f9dSMark Adams // Fill A --> fact 12428f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1243042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 12448f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 12458f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 12468f7e8f9dSMark Adams // zero rows of B 12478f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12488f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 12498f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 12508f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 12518f7e8f9dSMark Adams /* zero (unfactored row) */ 12528f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 12538f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 12548f7e8f9dSMark Adams }); 12558f7e8f9dSMark Adams // copy A into B 12568f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12578f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 12588f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 12598f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 12608f7e8f9dSMark Adams /* load in initial (unfactored row) */ 12618f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 12628f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 12638f7e8f9dSMark Adams PetscScalar vala = av[j]; 12648f7e8f9dSMark Adams if (colb == rowb) { 12658f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 12668f7e8f9dSMark Adams } else { 12678f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12688f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12698f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 12708f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 12718f7e8f9dSMark Adams if (pbj[j] == colb) { 12728f7e8f9dSMark Adams pba[j] = vala; 12738f7e8f9dSMark Adams set++; 12748f7e8f9dSMark Adams break; 12758f7e8f9dSMark Adams } 12768f7e8f9dSMark Adams } 12778f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 12788f7e8f9dSMark Adams } 12798f7e8f9dSMark Adams } 12808f7e8f9dSMark Adams }); 12818f7e8f9dSMark Adams }); 12828f7e8f9dSMark Adams Kokkos::fence(); 1283930e68a5SMark Adams 12848f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 12858f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12868f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12878f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1288042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 12898f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 12908f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 12918f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 12928f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 12938f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 12948f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 12958f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 12968f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 12978f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 12988f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 12998f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 13008f7e8f9dSMark Adams else { 13018f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 13028f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 13038f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 13048f7e8f9dSMark Adams size_t st_idx; 13058f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 13068f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 13078f7e8f9dSMark Adams // get column, there has got to be a better way 13088f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 13098f7e8f9dSMark Adams if (pjL[j] == ii) { 13108f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13118f7e8f9dSMark Adams idx = j; // output 13128f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 13138f7e8f9dSMark Adams } 13148f7e8f9dSMark Adams }, st_idx); 13158f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 131699551766SMark Adams #if defined(PETSC_USE_DEBUG) 131799551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 131899551766SMark Adams #endif 131999551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 13208f7e8f9dSMark Adams // U(i+1,:end) 13218f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 13228f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 13238f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 13248f7e8f9dSMark Adams if (col==myk) { 13258f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 13268f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 13278f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 13288f7e8f9dSMark Adams } else { 13298f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 13308f7e8f9dSMark Adams PetscInt high, low; 13318f7e8f9dSMark Adams const PetscInt *startj; 13328f7e8f9dSMark Adams if (col<myk) { // L 13338f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 13348f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 13358f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 13368f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 13378f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 13388f7e8f9dSMark Adams } else { 13398f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 13408f7e8f9dSMark Adams start = ba_d + idx; 13418f7e8f9dSMark Adams startj= bj_d + idx; 13428f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 13438f7e8f9dSMark Adams } 13448f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 13458f7e8f9dSMark Adams low = 0; 13468f7e8f9dSMark Adams high = (PetscInt)(end-start); 13478f7e8f9dSMark Adams while (high-low > 5) { 13488f7e8f9dSMark Adams int t = (low+high)/2; 13498f7e8f9dSMark Adams if (startj[t] > col) high = t; 13508f7e8f9dSMark Adams else low = t; 13518f7e8f9dSMark Adams } 13528f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 13538f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 13548f7e8f9dSMark Adams } 135599551766SMark Adams #if defined(PETSC_USE_DEBUG) 135699551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 135799551766SMark Adams #endif 13588f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 13598f7e8f9dSMark Adams } 13608f7e8f9dSMark Adams }); 13618f7e8f9dSMark Adams } 13628f7e8f9dSMark Adams }); 13638f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 13648f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 13658f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 13668f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 13678f7e8f9dSMark Adams }); 13688f7e8f9dSMark Adams Kokkos::fence(); 13698f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 13708f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 13718f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 13728f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 13738f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 13748f7e8f9dSMark Adams #endif 13758f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 13768f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 13778f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 13788f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 13798f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 13808f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 13818f7e8f9dSMark Adams if (i < end) { 13828f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 13838f7e8f9dSMark Adams *pv = 1.0/(*pv); 13848f7e8f9dSMark Adams } 13858f7e8f9dSMark Adams }); 13868f7e8f9dSMark Adams }); 13878f7e8f9dSMark Adams } 13888f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 13898f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 13908f7e8f9dSMark Adams #endif 13918f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 13928f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 13938f7e8f9dSMark Adams 13948f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 13958f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 13968f7e8f9dSMark Adams if (b->inode.size) { 13978f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 13988f7e8f9dSMark Adams } else if (row_identity && col_identity) { 13998f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 14008f7e8f9dSMark Adams } else { 14018f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 14028f7e8f9dSMark Adams } 14038f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 14048f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 14058f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 14068f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 14078f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 14088f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 14098f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 14108f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 14118f7e8f9dSMark Adams 1412930e68a5SMark Adams PetscFunctionReturn(0); 1413930e68a5SMark Adams } 1414930e68a5SMark Adams 141586a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1416930e68a5SMark Adams { 1417930e68a5SMark Adams PetscErrorCode ierr; 1418930e68a5SMark Adams 1419930e68a5SMark Adams PetscFunctionBegin; 1420930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 142186a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 142286a27549SJunchao Zhang PetscFunctionReturn(0); 142386a27549SJunchao Zhang } 142486a27549SJunchao Zhang 142586a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 142686a27549SJunchao Zhang { 142786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 142886a27549SJunchao Zhang 142986a27549SJunchao Zhang PetscFunctionBegin; 143086a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 143186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 143286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 143386a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 143486a27549SJunchao Zhang } 143586a27549SJunchao Zhang PetscFunctionReturn(0); 143686a27549SJunchao Zhang } 143786a27549SJunchao Zhang 143886a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 143986a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 144086a27549SJunchao Zhang { 144186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1442076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 144386a27549SJunchao Zhang 144486a27549SJunchao Zhang PetscFunctionBegin; 144586a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 144686a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1447076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 144886a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1449076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1450076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 145186a27549SJunchao Zhang 145286a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1453076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1454076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1455076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 145686a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 145786a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 145886a27549SJunchao Zhang 145986a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 146086a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 146186a27549SJunchao Zhang */ 1462076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1463076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 146486a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 146586a27549SJunchao Zhang 146686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 146786a27549SJunchao Zhang 146886a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1469076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 147086a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1471076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1472076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 147386a27549SJunchao Zhang 147486a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1475076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1476076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1477076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 147886a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 147986a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 148086a27549SJunchao Zhang 148186a27549SJunchao Zhang /* Sort indices. See comments above */ 1482076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1483076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 148486a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 148586a27549SJunchao Zhang 148686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 148786a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 148886a27549SJunchao Zhang } 148986a27549SJunchao Zhang PetscFunctionReturn(0); 149086a27549SJunchao Zhang } 149186a27549SJunchao Zhang 149286a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 149386a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 149486a27549SJunchao Zhang { 149586a27549SJunchao Zhang PetscErrorCode ierr; 149686a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 149786a27549SJunchao Zhang PetscScalarKokkosView xv; 149886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 149986a27549SJunchao Zhang 150086a27549SJunchao Zhang PetscFunctionBegin; 150186a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 150286a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 150386a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 150486a27549SJunchao Zhang /* Solve L tmpv = b */ 15053f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 150686a27549SJunchao Zhang /* Solve Ux = tmpv */ 15073f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 150886a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 150986a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 151086a27549SJunchao Zhang PetscFunctionReturn(0); 151186a27549SJunchao Zhang } 151286a27549SJunchao Zhang 1513076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 151486a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 151586a27549SJunchao Zhang { 151686a27549SJunchao Zhang PetscErrorCode ierr; 151786a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 151886a27549SJunchao Zhang PetscScalarKokkosView xv; 151986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 152086a27549SJunchao Zhang 152186a27549SJunchao Zhang PetscFunctionBegin; 152286a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 152386a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 152486a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 152586a27549SJunchao Zhang /* Solve U^T tmpv = b */ 152686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 152786a27549SJunchao Zhang 152886a27549SJunchao Zhang /* Solve L^T x = tmpv */ 152986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 153086a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 153186a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 153286a27549SJunchao Zhang PetscFunctionReturn(0); 153386a27549SJunchao Zhang } 153486a27549SJunchao Zhang 153586a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 153686a27549SJunchao Zhang { 153786a27549SJunchao Zhang PetscErrorCode ierr; 153886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 153986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 154086a27549SJunchao Zhang PetscInt fill_lev = info->levels; 154186a27549SJunchao Zhang 154286a27549SJunchao Zhang PetscFunctionBegin; 154386a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1544076ba34aSJunchao Zhang 1545076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1546076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1547076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1548076ba34aSJunchao Zhang 1549076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 155086a27549SJunchao Zhang 155186a27549SJunchao Zhang B->assembled = PETSC_TRUE; 155286a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 155386a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 155486a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 155586a27549SJunchao Zhang B->ops->matsolve = NULL; 155686a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 155786a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 155886a27549SJunchao Zhang 155986a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 156086a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 156186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 156286a27549SJunchao Zhang PetscFunctionReturn(0); 156386a27549SJunchao Zhang } 156486a27549SJunchao Zhang 156586a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 156686a27549SJunchao Zhang { 156786a27549SJunchao Zhang PetscErrorCode ierr; 156886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 156986a27549SJunchao Zhang Mat_SeqAIJ *b; 157086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 157186a27549SJunchao Zhang PetscInt fill_lev = info->levels; 157286a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 157386a27549SJunchao Zhang PetscInt n = A->rmap->n; 157486a27549SJunchao Zhang 157586a27549SJunchao Zhang PetscFunctionBegin; 157686a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 157786a27549SJunchao Zhang /* Rebuild factors */ 157886a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 157986a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 158086a27549SJunchao Zhang 158186a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 158286a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 158386a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 158486a27549SJunchao Zhang 158586a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 158686a27549SJunchao Zhang 158786a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 158886a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 158986a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 159086a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 159186a27549SJunchao Zhang 159286a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1593076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1594076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1595076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 159686a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 159786a27549SJunchao Zhang 159886a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 159986a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 160086a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 160186a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 160286a27549SJunchao Zhang 160386a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 160486a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 160586a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 160686a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 160786a27549SJunchao Zhang #else 160886a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 160986a27549SJunchao Zhang #endif 161086a27549SJunchao Zhang 161186a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 161286a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 161386a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 161486a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 161586a27549SJunchao Zhang 161686a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 161786a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 161886a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 161986a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 162086a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 162186a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 162286a27549SJunchao Zhang 162386a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 162486a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1625930e68a5SMark Adams PetscFunctionReturn(0); 1626930e68a5SMark Adams } 1627930e68a5SMark Adams 16288f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 16298f7e8f9dSMark Adams { 16308f7e8f9dSMark Adams PetscErrorCode ierr; 16318f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 16328f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1633930e68a5SMark Adams 16348f7e8f9dSMark Adams PetscFunctionBegin; 16358f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 16368f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 16378f7e8f9dSMark Adams // move B data into Kokkos 16388f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 16398f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 16408f7e8f9dSMark Adams { 16418f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 16428f7e8f9dSMark Adams if (!baijkok->diag_d) { 16438f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1644152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 16458f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 16468f7e8f9dSMark Adams } 16478f7e8f9dSMark Adams } 16488f7e8f9dSMark Adams PetscFunctionReturn(0); 16498f7e8f9dSMark Adams } 16508f7e8f9dSMark Adams 165186a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1652930e68a5SMark Adams { 1653930e68a5SMark Adams PetscFunctionBegin; 1654930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1655930e68a5SMark Adams PetscFunctionReturn(0); 1656930e68a5SMark Adams } 1657930e68a5SMark Adams 16588f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 16598f7e8f9dSMark Adams { 16608f7e8f9dSMark Adams PetscFunctionBegin; 16618f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 16628f7e8f9dSMark Adams PetscFunctionReturn(0); 16638f7e8f9dSMark Adams } 16648f7e8f9dSMark Adams 1665930e68a5SMark Adams /*MC 166686a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 166786a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1668930e68a5SMark Adams 1669930e68a5SMark Adams Level: beginner 1670930e68a5SMark Adams 167186a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1672930e68a5SMark Adams M*/ 167386a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1674930e68a5SMark Adams { 1675930e68a5SMark Adams PetscErrorCode ierr; 1676930e68a5SMark Adams PetscInt n = A->rmap->n; 1677930e68a5SMark Adams 1678930e68a5SMark Adams PetscFunctionBegin; 1679930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1680930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1681930e68a5SMark Adams (*B)->factortype = ftype; 16824ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1683930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1684930e68a5SMark Adams 16858f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1686930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 168786a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 168886a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 168986a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 169086a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 169186a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 169286a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 169386a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1694930e68a5SMark Adams 1695930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 169686a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1697930e68a5SMark Adams PetscFunctionReturn(0); 1698930e68a5SMark Adams } 16998f7e8f9dSMark Adams 17008f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 17018f7e8f9dSMark Adams { 17028f7e8f9dSMark Adams PetscErrorCode ierr; 17038f7e8f9dSMark Adams PetscInt n = A->rmap->n; 17048f7e8f9dSMark Adams 17058f7e8f9dSMark Adams PetscFunctionBegin; 17068f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 17078f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 17088f7e8f9dSMark Adams (*B)->factortype = ftype; 1709f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 17104ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 17118f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 17128f7e8f9dSMark Adams 17138f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17148f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 17158f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17168f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 17178f7e8f9dSMark Adams 17188f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 17198f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 17208f7e8f9dSMark Adams PetscFunctionReturn(0); 17218f7e8f9dSMark Adams } 172286a27549SJunchao Zhang 172386a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 172486a27549SJunchao Zhang { 172586a27549SJunchao Zhang PetscErrorCode ierr; 172686a27549SJunchao Zhang 172786a27549SJunchao Zhang PetscFunctionBegin; 172886a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 172986a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 173086a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 173186a27549SJunchao Zhang PetscFunctionReturn(0); 173286a27549SJunchao Zhang } 173386a27549SJunchao Zhang 1734076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1735076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1736076ba34aSJunchao Zhang { 1737076ba34aSJunchao Zhang PetscErrorCode ierr; 1738076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1739076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1740076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1741076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1742076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1743076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1744076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1745076ba34aSJunchao Zhang 1746076ba34aSJunchao Zhang PetscFunctionBegin; 1747c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1748076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1749c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1750076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1751c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1752076ba34aSJunchao Zhang } 1753076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1754076ba34aSJunchao Zhang } 1755076ba34aSJunchao Zhang PetscFunctionReturn(0); 1756076ba34aSJunchao Zhang } 1757