111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1786a27549SJunchao Zhang 1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 28076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 29076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 308c3ff71bSJunchao Zhang 318c3ff71bSJunchao Zhang PetscFunctionBegin; 32076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyEnd_SeqAIJ(A,mode)); 34076ba34aSJunchao Zhang 35076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37076ba34aSJunchao Zhang 38076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 39076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 41076ba34aSJunchao Zhang */ 42076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43076ba34aSJunchao Zhang delete aijkok; 44076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45076ba34aSJunchao Zhang A->spptr = aijkok; 46076ba34aSJunchao Zhang } 47076ba34aSJunchao Zhang 48a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 49a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50a587d139SMark } 518c3ff71bSJunchao Zhang PetscFunctionReturn(0); 528c3ff71bSJunchao Zhang } 538c3ff71bSJunchao Zhang 5486a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 55076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 568c3ff71bSJunchao Zhang { 578c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 588c3ff71bSJunchao Zhang 598c3ff71bSJunchao Zhang PetscFunctionBegin; 60*5f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61*5f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 63076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 64580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 6586a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 668c3ff71bSJunchao Zhang } 678c3ff71bSJunchao Zhang PetscFunctionReturn(0); 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang 70076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 71fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7286a27549SJunchao Zhang { 7386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7486a27549SJunchao Zhang 7586a27549SJunchao Zhang PetscFunctionBegin; 76*5f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 7786a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 7886a27549SJunchao Zhang aijkok->a_dual.modify_device(); 7986a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8086a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 81*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJInvalidateDiagonal(A)); 82*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectStateIncrease((PetscObject)A)); 8386a27549SJunchao Zhang PetscFunctionReturn(0); 8486a27549SJunchao Zhang } 8586a27549SJunchao Zhang 86f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87f0cf5187SStefano Zampini { 88f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89f0cf5187SStefano Zampini 90f0cf5187SStefano Zampini PetscFunctionBegin; 91f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9286a27549SJunchao Zhang /* We do not expect one needs factors on host */ 93*5f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94*5f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 96f0cf5187SStefano Zampini PetscFunctionReturn(0); 97f0cf5187SStefano Zampini } 98f0cf5187SStefano Zampini 99f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100f0cf5187SStefano Zampini { 101076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102f0cf5187SStefano Zampini 103f0cf5187SStefano Zampini PetscFunctionBegin; 104076ba34aSJunchao Zhang if (aijkok) { 105076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 106076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 107076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 108076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 109076ba34aSJunchao Zhang } 110076ba34aSJunchao Zhang PetscFunctionReturn(0); 111076ba34aSJunchao Zhang } 112076ba34aSJunchao Zhang 113076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 114076ba34aSJunchao Zhang { 115076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 116076ba34aSJunchao Zhang 117076ba34aSJunchao Zhang PetscFunctionBegin; 118076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 119076ba34aSJunchao Zhang PetscFunctionReturn(0); 120076ba34aSJunchao Zhang } 121076ba34aSJunchao Zhang 122076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 123076ba34aSJunchao Zhang { 124076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 125076ba34aSJunchao Zhang 126076ba34aSJunchao Zhang PetscFunctionBegin; 1272328674fSJunchao Zhang if (aijkok) { 128076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 129076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1302328674fSJunchao Zhang } else { 1312328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1322328674fSJunchao Zhang } 133076ba34aSJunchao Zhang PetscFunctionReturn(0); 134076ba34aSJunchao Zhang } 135076ba34aSJunchao Zhang 136076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 137076ba34aSJunchao Zhang { 138076ba34aSJunchao Zhang PetscFunctionBegin; 139076ba34aSJunchao Zhang *array = NULL; 140076ba34aSJunchao Zhang PetscFunctionReturn(0); 141076ba34aSJunchao Zhang } 142076ba34aSJunchao Zhang 143076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 144076ba34aSJunchao Zhang { 145076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146076ba34aSJunchao Zhang 147076ba34aSJunchao Zhang PetscFunctionBegin; 1482328674fSJunchao Zhang if (aijkok) { 149076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1502328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1512328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1522328674fSJunchao Zhang } 153076ba34aSJunchao Zhang PetscFunctionReturn(0); 154076ba34aSJunchao Zhang } 155076ba34aSJunchao Zhang 156076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 157076ba34aSJunchao Zhang { 158076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 159076ba34aSJunchao Zhang 160076ba34aSJunchao Zhang PetscFunctionBegin; 1612328674fSJunchao Zhang if (aijkok) { 162076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 163076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1642328674fSJunchao Zhang } 165f0cf5187SStefano Zampini PetscFunctionReturn(0); 166f0cf5187SStefano Zampini } 167f0cf5187SStefano Zampini 168a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 169042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 170a587d139SMark { 171042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 172042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 173a587d139SMark 174a587d139SMark PetscFunctionBegin; 175*5f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 176152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 177a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 178a587d139SMark PetscFunctionReturn(0); 179a587d139SMark } 180a587d139SMark 181a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 182042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 183a587d139SMark { 184042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 185a587d139SMark 186a587d139SMark PetscFunctionBegin; 187a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 188a587d139SMark *d_mat = aijkok->device_mat_d.data(); 189a587d139SMark } else { 190*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 191a587d139SMark *d_mat = NULL; 192a587d139SMark } 193a587d139SMark PetscFunctionReturn(0); 194a587d139SMark } 195076ba34aSJunchao Zhang 196076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 197076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 198152b3e56SJunchao Zhang { 199152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 200152b3e56SJunchao Zhang 201152b3e56SJunchao Zhang PetscFunctionBegin; 202*5f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 203076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 204076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 205076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 206076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 207076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 20886a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 209076ba34aSJunchao Zhang } 210076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 211152b3e56SJunchao Zhang PetscFunctionReturn(0); 212152b3e56SJunchao Zhang } 213152b3e56SJunchao Zhang 214076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 215076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 216152b3e56SJunchao Zhang { 217152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 218152b3e56SJunchao Zhang 219152b3e56SJunchao Zhang PetscFunctionBegin; 220*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 221*5f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 222076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 223076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 224076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 225076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 226076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 227076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 228076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 229076ba34aSJunchao Zhang #endif 23086a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 231076ba34aSJunchao Zhang } 232076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 233*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 234152b3e56SJunchao Zhang PetscFunctionReturn(0); 235152b3e56SJunchao Zhang } 236a587d139SMark 2378c3ff71bSJunchao Zhang /* y = A x */ 2388c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2398c3ff71bSJunchao Zhang { 2408c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 241152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 242152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2438c3ff71bSJunchao Zhang 2448c3ff71bSJunchao Zhang PetscFunctionBegin; 245*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 246*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 247*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(xx,&xv)); 248*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosViewWrite(yy,&yv)); 2498c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 250152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 251*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(xx,&xv)); 252*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosViewWrite(yy,&yv)); 253076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 254*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 255*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 2568c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2578c3ff71bSJunchao Zhang } 2588c3ff71bSJunchao Zhang 2598c3ff71bSJunchao Zhang /* y = A^T x */ 2608c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2618c3ff71bSJunchao Zhang { 2628c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 263152b3e56SJunchao Zhang const char *mode; 264152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 265152b3e56SJunchao Zhang PetscScalarKokkosView yv; 266076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2678c3ff71bSJunchao Zhang 2688c3ff71bSJunchao Zhang PetscFunctionBegin; 269*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 270*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 271*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(xx,&xv)); 272*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosViewWrite(yy,&yv)); 273152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 274*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 275152b3e56SJunchao Zhang mode = "N"; 276152b3e56SJunchao Zhang } else { 277076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 278076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 279152b3e56SJunchao Zhang mode = "T"; 280152b3e56SJunchao Zhang } 281076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 282*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(xx,&xv)); 283*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosViewWrite(yy,&yv)); 284*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*csrmat->nnz())); 285*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 2868c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2878c3ff71bSJunchao Zhang } 2888c3ff71bSJunchao Zhang 2898c3ff71bSJunchao Zhang /* y = A^H x */ 2908c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2918c3ff71bSJunchao Zhang { 2928c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 293152b3e56SJunchao Zhang const char *mode; 294152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 295152b3e56SJunchao Zhang PetscScalarKokkosView yv; 296076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2978c3ff71bSJunchao Zhang 2988c3ff71bSJunchao Zhang PetscFunctionBegin; 299*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 300*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 301*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(xx,&xv)); 302*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosViewWrite(yy,&yv)); 303152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 304*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 305152b3e56SJunchao Zhang mode = "N"; 306152b3e56SJunchao Zhang } else { 307076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 308076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 309152b3e56SJunchao Zhang mode = "C"; 310152b3e56SJunchao Zhang } 311076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 312*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(xx,&xv)); 313*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosViewWrite(yy,&yv)); 314*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*csrmat->nnz())); 315*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 3168c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3178c3ff71bSJunchao Zhang } 3188c3ff71bSJunchao Zhang 3198c3ff71bSJunchao Zhang /* z = A x + y */ 3208c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3218c3ff71bSJunchao Zhang { 3228c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 323152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 324152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3258c3ff71bSJunchao Zhang 3268c3ff71bSJunchao Zhang PetscFunctionBegin; 327*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 328*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 329*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(xx,&xv)); 330*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(yy,&yv)); 331*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosViewWrite(zz,&zv)); 3328c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3338c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 334152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 335*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(xx,&xv)); 336*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(yy,&yv)); 337*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosViewWrite(zz,&zv)); 338*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 339*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 3408c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3418c3ff71bSJunchao Zhang } 3428c3ff71bSJunchao Zhang 3438c3ff71bSJunchao Zhang /* z = A^T x + y */ 3448c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3458c3ff71bSJunchao Zhang { 3468c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 347152b3e56SJunchao Zhang const char *mode; 348152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 349152b3e56SJunchao Zhang PetscScalarKokkosView zv; 350076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3518c3ff71bSJunchao Zhang 3528c3ff71bSJunchao Zhang PetscFunctionBegin; 353*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 354*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 355*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(xx,&xv)); 356*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(yy,&yv)); 357*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosViewWrite(zz,&zv)); 3588c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 359152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 360*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 361152b3e56SJunchao Zhang mode = "N"; 362152b3e56SJunchao Zhang } else { 363076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 364076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 365152b3e56SJunchao Zhang mode = "T"; 366152b3e56SJunchao Zhang } 367076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 368*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(xx,&xv)); 369*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(yy,&yv)); 370*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosViewWrite(zz,&zv)); 371*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*csrmat->nnz())); 372*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 3738c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3748c3ff71bSJunchao Zhang } 3758c3ff71bSJunchao Zhang 3768c3ff71bSJunchao Zhang /* z = A^H x + y */ 3778c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3788c3ff71bSJunchao Zhang { 3798c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 380152b3e56SJunchao Zhang const char *mode; 381152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 382152b3e56SJunchao Zhang PetscScalarKokkosView zv; 383076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3848c3ff71bSJunchao Zhang 3858c3ff71bSJunchao Zhang PetscFunctionBegin; 386*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 387*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 388*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(xx,&xv)); 389*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(yy,&yv)); 390*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosViewWrite(zz,&zv)); 3918c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 392152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 393*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 394152b3e56SJunchao Zhang mode = "N"; 395152b3e56SJunchao Zhang } else { 396076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 397076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 398152b3e56SJunchao Zhang mode = "C"; 399152b3e56SJunchao Zhang } 400076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 401*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(xx,&xv)); 402*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(yy,&yv)); 403*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosViewWrite(zz,&zv)); 404*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(2.0*csrmat->nnz())); 405*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 406152b3e56SJunchao Zhang PetscFunctionReturn(0); 407152b3e56SJunchao Zhang } 408152b3e56SJunchao Zhang 409152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 410152b3e56SJunchao Zhang { 411152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 412152b3e56SJunchao Zhang 413152b3e56SJunchao Zhang PetscFunctionBegin; 414152b3e56SJunchao Zhang switch (op) { 415152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 416152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 417*5f80ce2aSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) CHKERRQ(aijkok->DestroyMatTranspose()); 418152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 419152b3e56SJunchao Zhang break; 420152b3e56SJunchao Zhang default: 421*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetOption_SeqAIJ(A,op,flg)); 422152b3e56SJunchao Zhang break; 423152b3e56SJunchao Zhang } 4248c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4258c3ff71bSJunchao Zhang } 4268c3ff71bSJunchao Zhang 427076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4283d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4298c3ff71bSJunchao Zhang { 430076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4318c3ff71bSJunchao Zhang 4328c3ff71bSJunchao Zhang PetscFunctionBegin; 433*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscKokkosInitializeCheck()); 434076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 435*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 4368c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 437*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 438076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 439*5f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 440*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(A->defaultvectype)); 441*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 442*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 443*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetOps_SeqAIJKokkos(A)); 444076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 445394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 446*5f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 447076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4488c3ff71bSJunchao Zhang } 449076ba34aSJunchao Zhang } 4508c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4518c3ff71bSJunchao Zhang } 4528c3ff71bSJunchao Zhang 453076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 454076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 455076ba34aSJunchao Zhang */ 456076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4578c3ff71bSJunchao Zhang { 458076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 459076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 460076ba34aSJunchao Zhang Mat mat; 4618c3ff71bSJunchao Zhang 4628c3ff71bSJunchao Zhang PetscFunctionBegin; 463076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 464*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 465076ba34aSJunchao Zhang mat = *B; 466076ba34aSJunchao Zhang if (A->assembled) { 467076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 468076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 469076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 470076ba34aSJunchao Zhang /* Now copy values to B if needed */ 471076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 472076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 473076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 474076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 475076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 476076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 477076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 478076ba34aSJunchao Zhang } 479076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 480076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 481076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 482076ba34aSJunchao Zhang } 483076ba34aSJunchao Zhang mat->spptr = bkok; 484076ba34aSJunchao Zhang } 485076ba34aSJunchao Zhang 486*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(mat->defaultvectype)); 487*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 488*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 489*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetOps_SeqAIJKokkos(mat)); 4908c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4918c3ff71bSJunchao Zhang } 4928c3ff71bSJunchao Zhang 4930ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 4940ecb592aSJunchao Zhang { 4950ecb592aSJunchao Zhang Mat At; 496ff751488SJunchao Zhang KokkosCsrMatrix *internT; 4970ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 4980ecb592aSJunchao Zhang 4990ecb592aSJunchao Zhang PetscFunctionBegin; 500*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 5010ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 502ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 503ff751488SJunchao Zhang CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 504*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 5050ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 506*5f80ce2aSJacob Faibussowitsch else CHKERRQ(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 5070ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5080ecb592aSJunchao Zhang if ((*B)->assembled) { 5090ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5100ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 511*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(*B)); 5120ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5130ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5140ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5150ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5160ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 5170ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5180ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5190ecb592aSJunchao Zhang } 5200ecb592aSJunchao Zhang PetscFunctionReturn(0); 5210ecb592aSJunchao Zhang } 5220ecb592aSJunchao Zhang 5238c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5248c3ff71bSJunchao Zhang { 52586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5268c3ff71bSJunchao Zhang 5278c3ff71bSJunchao Zhang PetscFunctionBegin; 52886a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 52986a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5308c3ff71bSJunchao Zhang delete aijkok; 53186a27549SJunchao Zhang } else { 53286a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 53386a27549SJunchao Zhang } 534cbc6b225SStefano Zampini A->spptr = NULL; 535*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 536*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 537*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 538*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy_SeqAIJ(A)); 5398c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5408c3ff71bSJunchao Zhang } 5418c3ff71bSJunchao Zhang 5423f3ba80aSJunchao Zhang /*MC 5433f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 5443f3ba80aSJunchao Zhang 5453f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 5463f3ba80aSJunchao Zhang 5473f3ba80aSJunchao Zhang Options Database Keys: 5483f3ba80aSJunchao Zhang . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 5493f3ba80aSJunchao Zhang 5503f3ba80aSJunchao Zhang Level: beginner 5513f3ba80aSJunchao Zhang 5523f3ba80aSJunchao Zhang .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 5533f3ba80aSJunchao Zhang M*/ 55486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 55586a27549SJunchao Zhang { 55686a27549SJunchao Zhang PetscFunctionBegin; 557*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscKokkosInitializeCheck()); 558*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate_SeqAIJ(A)); 559*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 56086a27549SJunchao Zhang PetscFunctionReturn(0); 56186a27549SJunchao Zhang } 56286a27549SJunchao Zhang 563076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 564076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 565a3f881fbSStefano Zampini { 566076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 567076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 568076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 569076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 570076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 571076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 572a3f881fbSStefano Zampini 573a3f881fbSStefano Zampini PetscFunctionBegin; 574076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 575076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 576076ba34aSJunchao Zhang PetscValidPointer(C,4); 577076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 578076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 579*5f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 580*5f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 581076ba34aSJunchao Zhang 582*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 583*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(B)); 584076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 585076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 586076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 587076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 588076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 589076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 590076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 591076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 592076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 593076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 594076ba34aSJunchao Zhang nnz = a->nz + b->nz; 595076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 596076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 597076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 598076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 599076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 600076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 601076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 602076ba34aSJunchao Zhang ca = ca_dual.view_device(); 603076ba34aSJunchao Zhang ci = ci_dual.view_device(); 604076ba34aSJunchao Zhang cj = cj_dual.view_device(); 605076ba34aSJunchao Zhang 606076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 607076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 608076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 609076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 610076ba34aSJunchao Zhang 611076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 612076ba34aSJunchao Zhang ci(i) = coffset; 613076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 614076ba34aSJunchao Zhang }); 615076ba34aSJunchao Zhang 616076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 617076ba34aSJunchao Zhang if (k < alen) { 618076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 619076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 620076ba34aSJunchao Zhang } else { 621076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 622076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 623076ba34aSJunchao Zhang } 624076ba34aSJunchao Zhang }); 625076ba34aSJunchao Zhang }); 626076ba34aSJunchao Zhang ca_dual.modify_device(); 627076ba34aSJunchao Zhang ci_dual.modify_device(); 628076ba34aSJunchao Zhang cj_dual.modify_device(); 629076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 630*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 631076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 632076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 633076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 634076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 635076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 636076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 637076ba34aSJunchao Zhang 638076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 639076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 640076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 641076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 642076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 643076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 644076ba34aSJunchao Zhang }); 645076ba34aSJunchao Zhang }); 646*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(*C)); 647076ba34aSJunchao Zhang } 648076ba34aSJunchao Zhang PetscFunctionReturn(0); 649076ba34aSJunchao Zhang } 650076ba34aSJunchao Zhang 651076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 652076ba34aSJunchao Zhang { 653076ba34aSJunchao Zhang PetscFunctionBegin; 654076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 655a3f881fbSStefano Zampini PetscFunctionReturn(0); 656a3f881fbSStefano Zampini } 657a3f881fbSStefano Zampini 658a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 659a3f881fbSStefano Zampini { 660a3f881fbSStefano Zampini Mat_Product *product = C->product; 661a3f881fbSStefano Zampini Mat A,B; 662076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 663a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 664a3f881fbSStefano Zampini Mat_SeqAIJ *c; 665076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 666076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 667a3f881fbSStefano Zampini 668a3f881fbSStefano Zampini PetscFunctionBegin; 669a3f881fbSStefano Zampini MatCheckProduct(C,1); 670*5f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 671076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 672076ba34aSJunchao Zhang 673076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 674076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 675076ba34aSJunchao Zhang PetscFunctionReturn(0); 676076ba34aSJunchao Zhang } 677076ba34aSJunchao Zhang 678076ba34aSJunchao Zhang switch (product->type) { 679076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 680076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 681076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 682076ba34aSJunchao Zhang default: 68398921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 684076ba34aSJunchao Zhang } 685076ba34aSJunchao Zhang 686a3f881fbSStefano Zampini A = product->A; 687a3f881fbSStefano Zampini B = product->B; 688*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 689*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(B)); 690a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 691a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 692a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 693076ba34aSJunchao Zhang 694*5f80ce2aSJacob Faibussowitsch PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 695076ba34aSJunchao Zhang 696076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 697076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 698076ba34aSJunchao Zhang 699076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 700076ba34aSJunchao Zhang if (transA) { 701*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 702076ba34aSJunchao Zhang transA = false; 703a3f881fbSStefano Zampini } 704a3f881fbSStefano Zampini 705076ba34aSJunchao Zhang if (transB) { 706*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 707076ba34aSJunchao Zhang transB = false; 708076ba34aSJunchao Zhang } 709*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 710076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 711076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 712*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 713*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(C)); 714a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 715a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 716*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 717*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 718*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 719a3f881fbSStefano Zampini c->reallocs = 0; 720076ba34aSJunchao Zhang C->info.mallocs = 0; 721a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 722a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 723a3f881fbSStefano Zampini C->num_ass++; 724a3f881fbSStefano Zampini PetscFunctionReturn(0); 725a3f881fbSStefano Zampini } 726a3f881fbSStefano Zampini 727a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 728a3f881fbSStefano Zampini { 729076ba34aSJunchao Zhang Mat_Product *product = C->product; 730076ba34aSJunchao Zhang MatProductType ptype; 731076ba34aSJunchao Zhang Mat A,B; 732076ba34aSJunchao Zhang bool transA,transB; 733076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 734076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 735076ba34aSJunchao Zhang MPI_Comm comm; 736076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 737a3f881fbSStefano Zampini 738a3f881fbSStefano Zampini PetscFunctionBegin; 739a3f881fbSStefano Zampini MatCheckProduct(C,1); 740*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm)); 741*5f80ce2aSJacob Faibussowitsch PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 742a3f881fbSStefano Zampini A = product->A; 743a3f881fbSStefano Zampini B = product->B; 744*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 745*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(B)); 746a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 747a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 748076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 749076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 750076ba34aSJunchao Zhang 751a3f881fbSStefano Zampini ptype = product->type; 752a3f881fbSStefano Zampini switch (ptype) { 753076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 754076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 755076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 756a3f881fbSStefano Zampini default: 75798921bdaSJacob Faibussowitsch SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 758a3f881fbSStefano Zampini } 759a3f881fbSStefano Zampini 760076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 761076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 762076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 763076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 764a3f881fbSStefano Zampini 765076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 766076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7676ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7686ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7696ffb9508SJunchao Zhang */ 770076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 771076ba34aSJunchao Zhang 772*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 773076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 774076ba34aSJunchao Zhang if (transA) { 775*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 776076ba34aSJunchao Zhang transA = false; 777076ba34aSJunchao Zhang } 778076ba34aSJunchao Zhang 779076ba34aSJunchao Zhang if (transB) { 780*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 781076ba34aSJunchao Zhang transB = false; 782076ba34aSJunchao Zhang } 783076ba34aSJunchao Zhang 784076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 785076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 786076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 787076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 788076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 789076ba34aSJunchao Zhang */ 790076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 791076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 792*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 793076ba34aSJunchao Zhang 794076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 795*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 796076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 797a3f881fbSStefano Zampini PetscFunctionReturn(0); 798a3f881fbSStefano Zampini } 799a3f881fbSStefano Zampini 800a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 801076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 802a3f881fbSStefano Zampini { 803076ba34aSJunchao Zhang Mat_Product *product = mat->product; 804a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 805a3f881fbSStefano Zampini 806a3f881fbSStefano Zampini PetscFunctionBegin; 807a3f881fbSStefano Zampini MatCheckProduct(mat,1); 808*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 809a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 810*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 811a3f881fbSStefano Zampini } 812a3f881fbSStefano Zampini if (Biskok && Ciskok) { 813a3f881fbSStefano Zampini switch (product->type) { 814a3f881fbSStefano Zampini case MATPRODUCT_AB: 815a3f881fbSStefano Zampini case MATPRODUCT_AtB: 816a3f881fbSStefano Zampini case MATPRODUCT_ABt: 817a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 818a3f881fbSStefano Zampini break; 819a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 820a3f881fbSStefano Zampini case MATPRODUCT_RARt: 821a3f881fbSStefano Zampini case MATPRODUCT_ABC: 822a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 823a3f881fbSStefano Zampini break; 824a3f881fbSStefano Zampini default: 825a3f881fbSStefano Zampini break; 826a3f881fbSStefano Zampini } 827a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 828*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatProductSetFromOptions_SeqAIJ(mat)); 829a3f881fbSStefano Zampini } 830a3f881fbSStefano Zampini PetscFunctionReturn(0); 831a3f881fbSStefano Zampini } 832a587d139SMark 833f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 834f0cf5187SStefano Zampini { 835f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 836f0cf5187SStefano Zampini 837f0cf5187SStefano Zampini PetscFunctionBegin; 838*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 839*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 840f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 841076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 842*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(A)); 843*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 844*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 845f0cf5187SStefano Zampini PetscFunctionReturn(0); 846f0cf5187SStefano Zampini } 847f0cf5187SStefano Zampini 848a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 849a587d139SMark { 850076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 851a587d139SMark 852a587d139SMark PetscFunctionBegin; 853076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8542328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 855076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 856*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(A)); 8572328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 858*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatZeroEntries_SeqAIJ(A)); 8592328674fSJunchao Zhang } 860a587d139SMark PetscFunctionReturn(0); 861a587d139SMark } 862a587d139SMark 863db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 86442550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 865db78de30SJunchao Zhang { 866db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 867db78de30SJunchao Zhang 868db78de30SJunchao Zhang PetscFunctionBegin; 869db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 870db78de30SJunchao Zhang PetscValidPointer(kv,2); 871db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 872*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 873db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 874076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 875db78de30SJunchao Zhang PetscFunctionReturn(0); 876db78de30SJunchao Zhang } 877db78de30SJunchao Zhang 87842550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 879db78de30SJunchao Zhang { 880db78de30SJunchao Zhang PetscFunctionBegin; 881db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 882db78de30SJunchao Zhang PetscValidPointer(kv,2); 883db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 884db78de30SJunchao Zhang PetscFunctionReturn(0); 885db78de30SJunchao Zhang } 886db78de30SJunchao Zhang 88742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 888db78de30SJunchao Zhang { 889db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 890db78de30SJunchao Zhang 891db78de30SJunchao Zhang PetscFunctionBegin; 892db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 893db78de30SJunchao Zhang PetscValidPointer(kv,2); 894db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 895*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 896db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 897076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 898db78de30SJunchao Zhang PetscFunctionReturn(0); 899db78de30SJunchao Zhang } 900db78de30SJunchao Zhang 90142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 902db78de30SJunchao Zhang { 903db78de30SJunchao Zhang PetscFunctionBegin; 904db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 905db78de30SJunchao Zhang PetscValidPointer(kv,2); 906db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 907*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(A)); 908db78de30SJunchao Zhang PetscFunctionReturn(0); 909db78de30SJunchao Zhang } 910db78de30SJunchao Zhang 91142550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 912db78de30SJunchao Zhang { 913db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 914db78de30SJunchao Zhang 915db78de30SJunchao Zhang PetscFunctionBegin; 916db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 917db78de30SJunchao Zhang PetscValidPointer(kv,2); 918db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 919db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 920076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 921db78de30SJunchao Zhang PetscFunctionReturn(0); 922db78de30SJunchao Zhang } 923db78de30SJunchao Zhang 92442550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 925db78de30SJunchao Zhang { 926db78de30SJunchao Zhang PetscFunctionBegin; 927db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928db78de30SJunchao Zhang PetscValidPointer(kv,2); 929db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(A)); 931db78de30SJunchao Zhang PetscFunctionReturn(0); 932db78de30SJunchao Zhang } 933db78de30SJunchao Zhang 934c17cf699SJunchao Zhang /* Computes Y += alpha X */ 935c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 936a587d139SMark { 937a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 938c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 939c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 940c17cf699SJunchao Zhang MatScalarKokkosView Ya; 941a587d139SMark 942a587d139SMark PetscFunctionBegin; 943c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 944c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 945*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(Y)); 946*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(X)); 947*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 948db78de30SJunchao Zhang 949c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 950c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 951a587d139SMark PetscBool e; 952*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 953a587d139SMark if (e) { 954*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscArraycmp(x->j,y->j,y->nz,&e)); 955c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 956a587d139SMark } 957a587d139SMark } 958db78de30SJunchao Zhang 959c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 960c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 961c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 962c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 963c17cf699SJunchao Zhang */ 964c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 965c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 966c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 967c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 968c17cf699SJunchao Zhang 969c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 970c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 971*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(Y)); 972c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 973c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 974c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 975c17cf699SJunchao Zhang 976c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 977c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 978c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 979c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 980c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 981c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 982c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 983c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 984c17cf699SJunchao Zhang q++; 985a587d139SMark } else { 986c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 987c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 988c17cf699SJunchao Zhang */ 989c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 990a587d139SMark } 991c17cf699SJunchao Zhang } 992c17cf699SJunchao Zhang }); 993c17cf699SJunchao Zhang }); 994*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosModifyDevice(Y)); 995c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 996c17cf699SJunchao Zhang Mat Z; 997c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 998c17cf699SJunchao Zhang KernelHandle kh; 999c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1000c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1001c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1002c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 1003*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1004*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatHeaderReplace(Y,&Z)); 1005c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1006c17cf699SJunchao Zhang } 1007*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 1008*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1009a587d139SMark PetscFunctionReturn(0); 1010a587d139SMark } 1011a587d139SMark 1012394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 101342550becSJunchao Zhang { 101442550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 101542550becSJunchao Zhang Mat_SeqAIJ *aseq; 101642550becSJunchao Zhang 101742550becSJunchao Zhang PetscFunctionBegin; 1018*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1019394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(mat->data); 102042550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1021cbc6b225SStefano Zampini delete akok; 1022cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1023*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatZeroEntries_SeqAIJKokkos(mat)); 1024394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 102542550becSJunchao Zhang PetscFunctionReturn(0); 102642550becSJunchao Zhang } 102742550becSJunchao Zhang 102842550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 102942550becSJunchao Zhang { 103042550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 103142550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1032394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1033394ed5ebSJunchao Zhang const PetscCountKokkosView& jmap = akok->jmap_d; 1034394ed5ebSJunchao Zhang const PetscCountKokkosView& perm = akok->perm_d; 103542550becSJunchao Zhang MatScalarKokkosView Aa; 103642550becSJunchao Zhang ConstMatScalarKokkosView kv; 103742550becSJunchao Zhang PetscMemType memtype; 103842550becSJunchao Zhang 103942550becSJunchao Zhang PetscFunctionBegin; 1040*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscGetMemType(v,&memtype)); 104142550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1042394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 104342550becSJunchao Zhang } else { 1044394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 104542550becSJunchao Zhang } 104642550becSJunchao Zhang 1047cbc6b225SStefano Zampini if (imode == INSERT_VALUES) { 1048*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1049cbc6b225SStefano Zampini Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1050*5f80ce2aSJacob Faibussowitsch } else CHKERRQ(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 105142550becSJunchao Zhang 1052cbc6b225SStefano Zampini Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));}); 1053394ed5ebSJunchao Zhang 1054*5f80ce2aSJacob Faibussowitsch if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1055*5f80ce2aSJacob Faibussowitsch else CHKERRQ(MatSeqAIJRestoreKokkosView(A,&Aa)); 105642550becSJunchao Zhang PetscFunctionReturn(0); 105742550becSJunchao Zhang } 105842550becSJunchao Zhang 105986a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 10608f7e8f9dSMark Adams { 10618f7e8f9dSMark Adams PetscFunctionBegin; 1062*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncHost(A)); 1063*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatLUFactorNumeric_SeqAIJ(B,A,info)); 10648f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 10658f7e8f9dSMark Adams PetscFunctionReturn(0); 10668f7e8f9dSMark Adams } 10678f7e8f9dSMark Adams 10688c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 10698c3ff71bSJunchao Zhang { 1070076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1071076ba34aSJunchao Zhang 10728c3ff71bSJunchao Zhang PetscFunctionBegin; 1073076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 10746f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 10756f3d89d0SStefano Zampini 10768c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 10778c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 10788c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1079a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1080f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1081a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1082076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 10838c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 10848c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 10858c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 10868c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 10878c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 10888c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1089076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 10900ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1091152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1092076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1093076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1094076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1095076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1096076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1097076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 109842550becSJunchao Zhang 1099*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1100*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1101076ba34aSJunchao Zhang PetscFunctionReturn(0); 1102076ba34aSJunchao Zhang } 1103076ba34aSJunchao Zhang 1104076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1105076ba34aSJunchao Zhang { 1106076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1107076ba34aSJunchao Zhang PetscInt i,m,n; 1108076ba34aSJunchao Zhang 1109076ba34aSJunchao Zhang PetscFunctionBegin; 1110*5f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1111076ba34aSJunchao Zhang 1112076ba34aSJunchao Zhang m = akok->nrows(); 1113076ba34aSJunchao Zhang n = akok->ncols(); 1114*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(A,m,n,m,n)); 1115*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(A,MATSEQAIJKOKKOS)); 1116076ba34aSJunchao Zhang 1117076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1118*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1119076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1120076ba34aSJunchao Zhang 1121076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1122076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1123076ba34aSJunchao Zhang 1124076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1125076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1126076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1127076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1128076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1129076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1130076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1131076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1132076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1133076ba34aSJunchao Zhang 1134*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m,&aseq->imax)); 1135*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(m,&aseq->ilen)); 1136076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1137076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1138076ba34aSJunchao Zhang } 1139076ba34aSJunchao Zhang 1140076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1141076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1142ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1143*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1144*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1145076ba34aSJunchao Zhang PetscFunctionReturn(0); 1146076ba34aSJunchao Zhang } 1147076ba34aSJunchao Zhang 1148076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1149076ba34aSJunchao Zhang 1150076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1151076ba34aSJunchao Zhang */ 1152076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1153076ba34aSJunchao Zhang { 1154076ba34aSJunchao Zhang PetscFunctionBegin; 1155*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(comm,A)); 1156*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 11578c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11588c3ff71bSJunchao Zhang } 11598c3ff71bSJunchao Zhang 11608c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1161152b3e56SJunchao Zhang /*@C 11628c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 11638c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 11648c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 11658c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 11668c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 11678c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 11688c3ff71bSJunchao Zhang 11698c3ff71bSJunchao Zhang Collective 11708c3ff71bSJunchao Zhang 11718c3ff71bSJunchao Zhang Input Parameters: 11728c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 11738c3ff71bSJunchao Zhang . m - number of rows 11748c3ff71bSJunchao Zhang . n - number of columns 11758c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 11768c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 11778c3ff71bSJunchao Zhang (possibly different for each row) or NULL 11788c3ff71bSJunchao Zhang 11798c3ff71bSJunchao Zhang Output Parameter: 11808c3ff71bSJunchao Zhang . A - the matrix 11818c3ff71bSJunchao Zhang 11828c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 11838c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 11848c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 11858c3ff71bSJunchao Zhang 11868c3ff71bSJunchao Zhang Notes: 11878c3ff71bSJunchao Zhang If nnz is given then nz is ignored 11888c3ff71bSJunchao Zhang 11898c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 11908c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 11918c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 11928c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 11938c3ff71bSJunchao Zhang 11948c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 11958c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 11968c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 11978c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 11988c3ff71bSJunchao Zhang 11998c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 12008c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 12018c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 12028c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 12038c3ff71bSJunchao Zhang 12048c3ff71bSJunchao Zhang Level: intermediate 12058c3ff71bSJunchao Zhang 1206076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 12078c3ff71bSJunchao Zhang @*/ 12088c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 12098c3ff71bSJunchao Zhang { 12108c3ff71bSJunchao Zhang PetscFunctionBegin; 1211*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscKokkosInitializeCheck()); 1212*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(comm,A)); 1213*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(*A,m,n,m,n)); 1214*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(*A,MATSEQAIJKOKKOS)); 1215*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 12168c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12178c3ff71bSJunchao Zhang } 1218930e68a5SMark Adams 12198f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 12208f7e8f9dSMark Adams // 122146804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 12228f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 12238f7e8f9dSMark Adams // 12248f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1225930e68a5SMark Adams { 12268f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 12278f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 12288f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 12298f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1230300d22a6SJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1231076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1232076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 12338f7e8f9dSMark Adams PetscBool row_identity,col_identity; 123446804e07SMark Adams PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1235930e68a5SMark Adams 1236930e68a5SMark Adams PetscFunctionBegin; 12372c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1238*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 12392c71b3e2SJacob Faibussowitsch PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1240*5f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetIndices(isrow,&r_h)); 1241*5f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetIndices(isicol,&ic_h)); 1242*5f80ce2aSJacob Faibussowitsch CHKERRQ(ISGetSize(isicol,&nc)); 1243*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1244*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 12458f7e8f9dSMark Adams { 12468f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 12478f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 12488f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 12498f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 12508f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 12518f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 12528f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 12538f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 12548f7e8f9dSMark Adams size_t flops_h = 0.0; 12558f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 12568f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 12578f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 12588f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 12598f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 12608f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 12618f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 12628f7e8f9dSMark Adams // Fill A --> fact 12638f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1264042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 12658f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 12668f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 12678f7e8f9dSMark Adams // zero rows of B 12688f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12698f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 12708f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 12718f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 12728f7e8f9dSMark Adams /* zero (unfactored row) */ 12738f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 12748f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 12758f7e8f9dSMark Adams }); 12768f7e8f9dSMark Adams // copy A into B 12778f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12788f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 12798f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 12808f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 12818f7e8f9dSMark Adams /* load in initial (unfactored row) */ 12828f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 12838f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 12848f7e8f9dSMark Adams PetscScalar vala = av[j]; 12858f7e8f9dSMark Adams if (colb == rowb) { 12868f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 12878f7e8f9dSMark Adams } else { 12888f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12898f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12908f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 12918f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 12928f7e8f9dSMark Adams if (pbj[j] == colb) { 12938f7e8f9dSMark Adams pba[j] = vala; 12948f7e8f9dSMark Adams set++; 12958f7e8f9dSMark Adams break; 12968f7e8f9dSMark Adams } 12978f7e8f9dSMark Adams } 12988f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 12998f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 13008f1da0b2SJunchao Zhang #endif 13018f7e8f9dSMark Adams } 13028f7e8f9dSMark Adams } 13038f7e8f9dSMark Adams }); 13048f7e8f9dSMark Adams }); 13058f7e8f9dSMark Adams Kokkos::fence(); 1306930e68a5SMark Adams 13078f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 13088f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13098f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13108f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1311042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 13128f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 13138f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 13148f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 13158f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 13168f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 13178f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 13188f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 13198f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 13208f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 13218f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 13228f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 13238f7e8f9dSMark Adams else { 13248f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 13258f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 13268f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 13278f7e8f9dSMark Adams size_t st_idx; 13288f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 13298f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 13308f7e8f9dSMark Adams // get column, there has got to be a better way 13318f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 13328f7e8f9dSMark Adams if (pjL[j] == ii) { 13338f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13348f7e8f9dSMark Adams idx = j; // output 13358f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 13368f7e8f9dSMark Adams } 13378f7e8f9dSMark Adams }, st_idx); 13388f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 13398f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 134099551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 134199551766SMark Adams #endif 134299551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 13438f7e8f9dSMark Adams // U(i+1,:end) 13448f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 13458f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 13468f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 13478f7e8f9dSMark Adams if (col==myk) { 13488f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 13498f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 13508f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 13518f7e8f9dSMark Adams } else { 13528f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 13538f7e8f9dSMark Adams PetscInt high, low; 13548f7e8f9dSMark Adams const PetscInt *startj; 13558f7e8f9dSMark Adams if (col<myk) { // L 13568f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 13578f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 13588f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 13598f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 13608f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 13618f7e8f9dSMark Adams } else { 13628f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 13638f7e8f9dSMark Adams start = ba_d + idx; 13648f7e8f9dSMark Adams startj= bj_d + idx; 13658f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 13668f7e8f9dSMark Adams } 13678f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 13688f7e8f9dSMark Adams low = 0; 13698f7e8f9dSMark Adams high = (PetscInt)(end-start); 13708f7e8f9dSMark Adams while (high-low > 5) { 13718f7e8f9dSMark Adams int t = (low+high)/2; 13728f7e8f9dSMark Adams if (startj[t] > col) high = t; 13738f7e8f9dSMark Adams else low = t; 13748f7e8f9dSMark Adams } 13758f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 13768f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 13778f7e8f9dSMark Adams } 13788f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 137999551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 138099551766SMark Adams #endif 13818f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 13828f7e8f9dSMark Adams } 13838f7e8f9dSMark Adams }); 13848f7e8f9dSMark Adams } 13858f7e8f9dSMark Adams }); 13868f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 13878f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 13888f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 13898f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 13908f7e8f9dSMark Adams }); 13918f7e8f9dSMark Adams Kokkos::fence(); 13928f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 1393*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 13948f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 13958f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 13968f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 13978f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 13988f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 13998f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 14008f7e8f9dSMark Adams if (i < end) { 14018f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 14028f7e8f9dSMark Adams *pv = 1.0/(*pv); 14038f7e8f9dSMark Adams } 14048f7e8f9dSMark Adams }); 14058f7e8f9dSMark Adams }); 14068f7e8f9dSMark Adams } 1407*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 1408*5f80ce2aSJacob Faibussowitsch CHKERRQ(ISRestoreIndices(isicol,&ic_h)); 1409*5f80ce2aSJacob Faibussowitsch CHKERRQ(ISRestoreIndices(isrow,&r_h)); 14108f7e8f9dSMark Adams 1411*5f80ce2aSJacob Faibussowitsch CHKERRQ(ISIdentity(isrow,&row_identity)); 1412*5f80ce2aSJacob Faibussowitsch CHKERRQ(ISIdentity(isicol,&col_identity)); 14138f7e8f9dSMark Adams if (b->inode.size) { 14148f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 14158f7e8f9dSMark Adams } else if (row_identity && col_identity) { 14168f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 14178f7e8f9dSMark Adams } else { 14188f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 14198f7e8f9dSMark Adams } 14208f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 1421*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 14228f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 14238f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 14248f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 14258f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 14268f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 14278f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 14288f7e8f9dSMark Adams 1429930e68a5SMark Adams PetscFunctionReturn(0); 1430930e68a5SMark Adams } 1431930e68a5SMark Adams 143286a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1433930e68a5SMark Adams { 1434930e68a5SMark Adams PetscFunctionBegin; 1435*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 143686a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 143786a27549SJunchao Zhang PetscFunctionReturn(0); 143886a27549SJunchao Zhang } 143986a27549SJunchao Zhang 144086a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 144186a27549SJunchao Zhang { 144286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 144386a27549SJunchao Zhang 144486a27549SJunchao Zhang PetscFunctionBegin; 144586a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 144686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 144786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 144886a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 144986a27549SJunchao Zhang } 145086a27549SJunchao Zhang PetscFunctionReturn(0); 145186a27549SJunchao Zhang } 145286a27549SJunchao Zhang 145386a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 145486a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 145586a27549SJunchao Zhang { 145686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1457076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 145886a27549SJunchao Zhang 145986a27549SJunchao Zhang PetscFunctionBegin; 146086a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 146186a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1462076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 146386a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1464076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1465076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 146686a27549SJunchao Zhang 146786a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1468076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1469076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1470076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 147186a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 147286a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 147386a27549SJunchao Zhang 147486a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 147586a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 147686a27549SJunchao Zhang */ 1477076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1478076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 147986a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 148086a27549SJunchao Zhang 148186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 148286a27549SJunchao Zhang 148386a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1484076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 148586a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1486076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1487076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 148886a27549SJunchao Zhang 148986a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1490076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1491076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1492076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 149386a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 149486a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 149586a27549SJunchao Zhang 149686a27549SJunchao Zhang /* Sort indices. See comments above */ 1497076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1498076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 149986a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 150086a27549SJunchao Zhang 150186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 150286a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 150386a27549SJunchao Zhang } 150486a27549SJunchao Zhang PetscFunctionReturn(0); 150586a27549SJunchao Zhang } 150686a27549SJunchao Zhang 150786a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 150886a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 150986a27549SJunchao Zhang { 151086a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 151186a27549SJunchao Zhang PetscScalarKokkosView xv; 151286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 151386a27549SJunchao Zhang 151486a27549SJunchao Zhang PetscFunctionBegin; 1515*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1516*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1517*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(b,&bv)); 1518*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosViewWrite(x,&xv)); 151986a27549SJunchao Zhang /* Solve L tmpv = b */ 15203f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 152186a27549SJunchao Zhang /* Solve Ux = tmpv */ 15223f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1523*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(b,&bv)); 1524*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosViewWrite(x,&xv)); 1525*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 152686a27549SJunchao Zhang PetscFunctionReturn(0); 152786a27549SJunchao Zhang } 152886a27549SJunchao Zhang 1529076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 153086a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 153186a27549SJunchao Zhang { 153286a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 153386a27549SJunchao Zhang PetscScalarKokkosView xv; 153486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 153586a27549SJunchao Zhang 153686a27549SJunchao Zhang PetscFunctionBegin; 1537*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1538*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosTransposeSolveCheck(A)); 1539*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosView(b,&bv)); 1540*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetKokkosViewWrite(x,&xv)); 154186a27549SJunchao Zhang /* Solve U^T tmpv = b */ 154286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 154386a27549SJunchao Zhang 154486a27549SJunchao Zhang /* Solve L^T x = tmpv */ 154586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1546*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosView(b,&bv)); 1547*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreKokkosViewWrite(x,&xv)); 1548*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 154986a27549SJunchao Zhang PetscFunctionReturn(0); 155086a27549SJunchao Zhang } 155186a27549SJunchao Zhang 155286a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 155386a27549SJunchao Zhang { 155486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 155586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 155686a27549SJunchao Zhang PetscInt fill_lev = info->levels; 155786a27549SJunchao Zhang 155886a27549SJunchao Zhang PetscFunctionBegin; 1559*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1560*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 1561076ba34aSJunchao Zhang 1562076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1563076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1564076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1565076ba34aSJunchao Zhang 1566076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 156786a27549SJunchao Zhang 156886a27549SJunchao Zhang B->assembled = PETSC_TRUE; 156986a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 157086a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 157186a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 157286a27549SJunchao Zhang B->ops->matsolve = NULL; 157386a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 157486a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 157586a27549SJunchao Zhang 157686a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 157786a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 157886a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1579eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 1580*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 158186a27549SJunchao Zhang PetscFunctionReturn(0); 158286a27549SJunchao Zhang } 158386a27549SJunchao Zhang 158486a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 158586a27549SJunchao Zhang { 158686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 158786a27549SJunchao Zhang Mat_SeqAIJ *b; 158886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 158986a27549SJunchao Zhang PetscInt fill_lev = info->levels; 159086a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 159186a27549SJunchao Zhang PetscInt n = A->rmap->n; 159286a27549SJunchao Zhang 159386a27549SJunchao Zhang PetscFunctionBegin; 1594*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); 159586a27549SJunchao Zhang /* Rebuild factors */ 159686a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 159786a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 159886a27549SJunchao Zhang 159986a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 160086a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 160186a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 160286a27549SJunchao Zhang 160386a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 160486a27549SJunchao Zhang 160586a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 160686a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 160786a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 160886a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 160986a27549SJunchao Zhang 161086a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1611076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1612076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1613076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 161486a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 161586a27549SJunchao Zhang 161686a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 161786a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 161886a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 161986a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 162086a27549SJunchao Zhang 162186a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 162286a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 162386a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 162486a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 162586a27549SJunchao Zhang #else 162686a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 162786a27549SJunchao Zhang #endif 162886a27549SJunchao Zhang 162986a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 163086a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 163186a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 163286a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 163386a27549SJunchao Zhang 163486a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 1635*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 163686a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 163786a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 163886a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 163986a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 164086a27549SJunchao Zhang 164186a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 164286a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1643930e68a5SMark Adams PetscFunctionReturn(0); 1644930e68a5SMark Adams } 1645930e68a5SMark Adams 16468f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 16478f7e8f9dSMark Adams { 16488f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 16498f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1650930e68a5SMark Adams 16518f7e8f9dSMark Adams PetscFunctionBegin; 1652*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 16538f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 16548f7e8f9dSMark Adams // move B data into Kokkos 1655*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1656*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 16578f7e8f9dSMark Adams { 16588f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1659300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 16608f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1661300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1662300d22a6SJunchao Zhang Kokkos::deep_copy (baijkok->diag_d, h_diag); 16638f7e8f9dSMark Adams } 16648f7e8f9dSMark Adams } 16658f7e8f9dSMark Adams PetscFunctionReturn(0); 16668f7e8f9dSMark Adams } 16678f7e8f9dSMark Adams 166886a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1669930e68a5SMark Adams { 1670930e68a5SMark Adams PetscFunctionBegin; 1671930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1672930e68a5SMark Adams PetscFunctionReturn(0); 1673930e68a5SMark Adams } 1674930e68a5SMark Adams 16758f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 16768f7e8f9dSMark Adams { 16778f7e8f9dSMark Adams PetscFunctionBegin; 16788f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 16798f7e8f9dSMark Adams PetscFunctionReturn(0); 16808f7e8f9dSMark Adams } 16818f7e8f9dSMark Adams 1682930e68a5SMark Adams /*MC 168386a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 168486a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1685930e68a5SMark Adams 1686930e68a5SMark Adams Level: beginner 1687930e68a5SMark Adams 16883f3ba80aSJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1689930e68a5SMark Adams M*/ 169086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1691930e68a5SMark Adams { 1692930e68a5SMark Adams PetscInt n = A->rmap->n; 1693930e68a5SMark Adams 1694930e68a5SMark Adams PetscFunctionBegin; 1695*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B)); 1696*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(*B,n,n,n,n)); 1697930e68a5SMark Adams (*B)->factortype = ftype; 1698*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1699*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(*B,MATSEQAIJKOKKOS)); 1700930e68a5SMark Adams 17018f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1702*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(*B,A,A)); 170386a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 170486a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 170586a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 1706*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(*B,A,A)); 170786a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 170886a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 170998921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1710930e68a5SMark Adams 1711*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1712*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1713930e68a5SMark Adams PetscFunctionReturn(0); 1714930e68a5SMark Adams } 17158f7e8f9dSMark Adams 17168f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 17178f7e8f9dSMark Adams { 17188f7e8f9dSMark Adams PetscInt n = A->rmap->n; 17198f7e8f9dSMark Adams 17208f7e8f9dSMark Adams PetscFunctionBegin; 1721*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B)); 1722*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(*B,n,n,n,n)); 17238f7e8f9dSMark Adams (*B)->factortype = ftype; 1724f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 1725*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1726*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(*B,MATSEQAIJKOKKOS)); 17278f7e8f9dSMark Adams 17288f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1729*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetBlockSizesFromMats(*B,A,A)); 17308f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17318f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 17328f7e8f9dSMark Adams 1733*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1734*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 17358f7e8f9dSMark Adams PetscFunctionReturn(0); 17368f7e8f9dSMark Adams } 173786a27549SJunchao Zhang 173886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 173986a27549SJunchao Zhang { 174086a27549SJunchao Zhang PetscFunctionBegin; 1741*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1742*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1743*5f80ce2aSJacob Faibussowitsch CHKERRQ(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 174486a27549SJunchao Zhang PetscFunctionReturn(0); 174586a27549SJunchao Zhang } 174686a27549SJunchao Zhang 1747076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1748076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1749076ba34aSJunchao Zhang { 1750076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1751076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1752076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1753076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1754076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1755076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1756076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1757076ba34aSJunchao Zhang 1758076ba34aSJunchao Zhang PetscFunctionBegin; 1759*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1760076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1761*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1762076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1763*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p])); 1764076ba34aSJunchao Zhang } 1765*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"\n")); 1766076ba34aSJunchao Zhang } 1767076ba34aSJunchao Zhang PetscFunctionReturn(0); 1768076ba34aSJunchao Zhang } 1769