111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1786a27549SJunchao Zhang 1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 28076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 29076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 308c3ff71bSJunchao Zhang 318c3ff71bSJunchao Zhang PetscFunctionBegin; 32076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 339566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34076ba34aSJunchao Zhang 35076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37076ba34aSJunchao Zhang 38076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 39076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 41076ba34aSJunchao Zhang */ 42076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43076ba34aSJunchao Zhang delete aijkok; 44076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45076ba34aSJunchao Zhang A->spptr = aijkok; 46076ba34aSJunchao Zhang } 47076ba34aSJunchao Zhang 48a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 49a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50a587d139SMark } 518c3ff71bSJunchao Zhang PetscFunctionReturn(0); 528c3ff71bSJunchao Zhang } 538c3ff71bSJunchao Zhang 5486a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 55076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 568c3ff71bSJunchao Zhang { 578c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 588c3ff71bSJunchao Zhang 598c3ff71bSJunchao Zhang PetscFunctionBegin; 605f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 615f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 63076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 64580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 6586a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 668c3ff71bSJunchao Zhang } 678c3ff71bSJunchao Zhang PetscFunctionReturn(0); 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang 70076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 71fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7286a27549SJunchao Zhang { 7386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7486a27549SJunchao Zhang 7586a27549SJunchao Zhang PetscFunctionBegin; 765f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 7786a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 7886a27549SJunchao Zhang aijkok->a_dual.modify_device(); 7986a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8086a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 829566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 8386a27549SJunchao Zhang PetscFunctionReturn(0); 8486a27549SJunchao Zhang } 8586a27549SJunchao Zhang 86f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87f0cf5187SStefano Zampini { 88f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89f0cf5187SStefano Zampini 90f0cf5187SStefano Zampini PetscFunctionBegin; 91f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9286a27549SJunchao Zhang /* We do not expect one needs factors on host */ 935f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 945f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 96f0cf5187SStefano Zampini PetscFunctionReturn(0); 97f0cf5187SStefano Zampini } 98f0cf5187SStefano Zampini 99f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100f0cf5187SStefano Zampini { 101076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102f0cf5187SStefano Zampini 103f0cf5187SStefano Zampini PetscFunctionBegin; 104076ba34aSJunchao Zhang if (aijkok) { 105076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 106076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 107076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 108076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 109076ba34aSJunchao Zhang } 110076ba34aSJunchao Zhang PetscFunctionReturn(0); 111076ba34aSJunchao Zhang } 112076ba34aSJunchao Zhang 113076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 114076ba34aSJunchao Zhang { 115076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 116076ba34aSJunchao Zhang 117076ba34aSJunchao Zhang PetscFunctionBegin; 118076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 119076ba34aSJunchao Zhang PetscFunctionReturn(0); 120076ba34aSJunchao Zhang } 121076ba34aSJunchao Zhang 122076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 123076ba34aSJunchao Zhang { 124076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 125076ba34aSJunchao Zhang 126076ba34aSJunchao Zhang PetscFunctionBegin; 1272328674fSJunchao Zhang if (aijkok) { 128076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 129076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1302328674fSJunchao Zhang } else { 1312328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1322328674fSJunchao Zhang } 133076ba34aSJunchao Zhang PetscFunctionReturn(0); 134076ba34aSJunchao Zhang } 135076ba34aSJunchao Zhang 136076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 137076ba34aSJunchao Zhang { 138076ba34aSJunchao Zhang PetscFunctionBegin; 139076ba34aSJunchao Zhang *array = NULL; 140076ba34aSJunchao Zhang PetscFunctionReturn(0); 141076ba34aSJunchao Zhang } 142076ba34aSJunchao Zhang 143076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 144076ba34aSJunchao Zhang { 145076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146076ba34aSJunchao Zhang 147076ba34aSJunchao Zhang PetscFunctionBegin; 1482328674fSJunchao Zhang if (aijkok) { 149076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1502328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1512328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1522328674fSJunchao Zhang } 153076ba34aSJunchao Zhang PetscFunctionReturn(0); 154076ba34aSJunchao Zhang } 155076ba34aSJunchao Zhang 156076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 157076ba34aSJunchao Zhang { 158076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 159076ba34aSJunchao Zhang 160076ba34aSJunchao Zhang PetscFunctionBegin; 1612328674fSJunchao Zhang if (aijkok) { 162076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 163076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1642328674fSJunchao Zhang } 165f0cf5187SStefano Zampini PetscFunctionReturn(0); 166f0cf5187SStefano Zampini } 167f0cf5187SStefano Zampini 168*7ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 169*7ee59b9bSJunchao Zhang { 170*7ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 171*7ee59b9bSJunchao Zhang 172*7ee59b9bSJunchao Zhang PetscFunctionBegin; 173*7ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 174*7ee59b9bSJunchao Zhang 175*7ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 176*7ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 177*7ee59b9bSJunchao Zhang if (a) { 178*7ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 179*7ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 180*7ee59b9bSJunchao Zhang } 181*7ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 182*7ee59b9bSJunchao Zhang PetscFunctionReturn(0); 183*7ee59b9bSJunchao Zhang } 184*7ee59b9bSJunchao Zhang 185a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 186042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 187a587d139SMark { 188042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 190a587d139SMark 191a587d139SMark PetscFunctionBegin; 1925f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 193152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 194a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 195a587d139SMark PetscFunctionReturn(0); 196a587d139SMark } 197a587d139SMark 198a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 199042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 200a587d139SMark { 201042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 202a587d139SMark 203a587d139SMark PetscFunctionBegin; 204a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 205a587d139SMark *d_mat = aijkok->device_mat_d.data(); 206a587d139SMark } else { 2079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 208a587d139SMark *d_mat = NULL; 209a587d139SMark } 210a587d139SMark PetscFunctionReturn(0); 211a587d139SMark } 212076ba34aSJunchao Zhang 213076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 214076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 215152b3e56SJunchao Zhang { 216152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 217152b3e56SJunchao Zhang 218152b3e56SJunchao Zhang PetscFunctionBegin; 2195f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 220076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 221076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 2229566063dSJacob Faibussowitsch PetscCallCXX(aijkok->a_dual.sync_device()); 2239566063dSJacob Faibussowitsch PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 2249566063dSJacob Faibussowitsch PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 22586a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 226076ba34aSJunchao Zhang } 227076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 228152b3e56SJunchao Zhang PetscFunctionReturn(0); 229152b3e56SJunchao Zhang } 230152b3e56SJunchao Zhang 231076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 232076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 233152b3e56SJunchao Zhang { 234152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 235152b3e56SJunchao Zhang 236152b3e56SJunchao Zhang PetscFunctionBegin; 2379566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2385f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 239076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 2409566063dSJacob Faibussowitsch PetscCallCXX(aijkok->a_dual.sync_device()); 2419566063dSJacob Faibussowitsch PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 2429566063dSJacob Faibussowitsch PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 243076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 244076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 245076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 246076ba34aSJunchao Zhang #endif 24786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 248076ba34aSJunchao Zhang } 249076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 2509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 251152b3e56SJunchao Zhang PetscFunctionReturn(0); 252152b3e56SJunchao Zhang } 253a587d139SMark 2548c3ff71bSJunchao Zhang /* y = A x */ 2558c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2568c3ff71bSJunchao Zhang { 2578c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 258152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 259152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2608c3ff71bSJunchao Zhang 2618c3ff71bSJunchao Zhang PetscFunctionBegin; 2629566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2649566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 2659566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy,&yv)); 2668c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 267152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 2689566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 2699566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 270076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 2719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 2729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2738c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2748c3ff71bSJunchao Zhang } 2758c3ff71bSJunchao Zhang 2768c3ff71bSJunchao Zhang /* y = A^T x */ 2778c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2788c3ff71bSJunchao Zhang { 2798c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 280152b3e56SJunchao Zhang const char *mode; 281152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 282152b3e56SJunchao Zhang PetscScalarKokkosView yv; 283076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2848c3ff71bSJunchao Zhang 2858c3ff71bSJunchao Zhang PetscFunctionBegin; 2869566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2889566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 2899566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy,&yv)); 290152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 2919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 292152b3e56SJunchao Zhang mode = "N"; 293152b3e56SJunchao Zhang } else { 294076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 295076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 296152b3e56SJunchao Zhang mode = "T"; 297152b3e56SJunchao Zhang } 298076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 2999566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 3009566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 3019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 3029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3038c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3048c3ff71bSJunchao Zhang } 3058c3ff71bSJunchao Zhang 3068c3ff71bSJunchao Zhang /* y = A^H x */ 3078c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 3088c3ff71bSJunchao Zhang { 3098c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 310152b3e56SJunchao Zhang const char *mode; 311152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 312152b3e56SJunchao Zhang PetscScalarKokkosView yv; 313076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3148c3ff71bSJunchao Zhang 3158c3ff71bSJunchao Zhang PetscFunctionBegin; 3169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3189566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 3199566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy,&yv)); 320152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 322152b3e56SJunchao Zhang mode = "N"; 323152b3e56SJunchao Zhang } else { 324076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 326152b3e56SJunchao Zhang mode = "C"; 327152b3e56SJunchao Zhang } 328076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 3299566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 3309566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 3319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 3329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3338c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3348c3ff71bSJunchao Zhang } 3358c3ff71bSJunchao Zhang 3368c3ff71bSJunchao Zhang /* z = A x + y */ 3378c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3388c3ff71bSJunchao Zhang { 3398c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 340152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 341152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3428c3ff71bSJunchao Zhang 3438c3ff71bSJunchao Zhang PetscFunctionBegin; 3449566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3469566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 3479566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy,&yv)); 3489566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz,&zv)); 3498c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3508c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 351152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 3529566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 3539566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy,&yv)); 3549566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 3559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 3569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3578c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3588c3ff71bSJunchao Zhang } 3598c3ff71bSJunchao Zhang 3608c3ff71bSJunchao Zhang /* z = A^T x + y */ 3618c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3628c3ff71bSJunchao Zhang { 3638c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 364152b3e56SJunchao Zhang const char *mode; 365152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 366152b3e56SJunchao Zhang PetscScalarKokkosView zv; 367076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3688c3ff71bSJunchao Zhang 3698c3ff71bSJunchao Zhang PetscFunctionBegin; 3709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3729566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 3739566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy,&yv)); 3749566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz,&zv)); 3758c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 376152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3779566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 378152b3e56SJunchao Zhang mode = "N"; 379152b3e56SJunchao Zhang } else { 380076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 381076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 382152b3e56SJunchao Zhang mode = "T"; 383152b3e56SJunchao Zhang } 384076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 3859566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 3869566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy,&yv)); 3879566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 3889566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 3899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3908c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3918c3ff71bSJunchao Zhang } 3928c3ff71bSJunchao Zhang 3938c3ff71bSJunchao Zhang /* z = A^H x + y */ 3948c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3958c3ff71bSJunchao Zhang { 3968c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 397152b3e56SJunchao Zhang const char *mode; 398152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 399152b3e56SJunchao Zhang PetscScalarKokkosView zv; 400076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 4018c3ff71bSJunchao Zhang 4028c3ff71bSJunchao Zhang PetscFunctionBegin; 4039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4059566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 4069566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy,&yv)); 4079566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz,&zv)); 4088c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 409152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 411152b3e56SJunchao Zhang mode = "N"; 412152b3e56SJunchao Zhang } else { 413076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 414076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 415152b3e56SJunchao Zhang mode = "C"; 416152b3e56SJunchao Zhang } 417076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 4189566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 4199566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy,&yv)); 4209566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 4219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 4229566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 423152b3e56SJunchao Zhang PetscFunctionReturn(0); 424152b3e56SJunchao Zhang } 425152b3e56SJunchao Zhang 426152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 427152b3e56SJunchao Zhang { 428152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 429152b3e56SJunchao Zhang 430152b3e56SJunchao Zhang PetscFunctionBegin; 431152b3e56SJunchao Zhang switch (op) { 432152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 433152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 4349566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 435152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 436152b3e56SJunchao Zhang break; 437152b3e56SJunchao Zhang default: 4389566063dSJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 439152b3e56SJunchao Zhang break; 440152b3e56SJunchao Zhang } 4418c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4428c3ff71bSJunchao Zhang } 4438c3ff71bSJunchao Zhang 444076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4453d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4468c3ff71bSJunchao Zhang { 447076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4488c3ff71bSJunchao Zhang 4498c3ff71bSJunchao Zhang PetscFunctionBegin; 4509566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 451076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 4529566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 4538c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 4549566063dSJacob Faibussowitsch PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 455076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 4565f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 4579566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 4589566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 4599566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 4609566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 461076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 462394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 4635f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 464076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4658c3ff71bSJunchao Zhang } 466076ba34aSJunchao Zhang } 4678c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4688c3ff71bSJunchao Zhang } 4698c3ff71bSJunchao Zhang 470076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 471076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 472076ba34aSJunchao Zhang */ 473076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4748c3ff71bSJunchao Zhang { 475076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 476076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 477076ba34aSJunchao Zhang Mat mat; 4788c3ff71bSJunchao Zhang 4798c3ff71bSJunchao Zhang PetscFunctionBegin; 480076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 4819566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 482076ba34aSJunchao Zhang mat = *B; 483076ba34aSJunchao Zhang if (A->assembled) { 484076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 485076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 486076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 487076ba34aSJunchao Zhang /* Now copy values to B if needed */ 488076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 489076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 490076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 491076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 492076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 493076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 494076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 495076ba34aSJunchao Zhang } 496076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 497076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 498076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 499076ba34aSJunchao Zhang } 500076ba34aSJunchao Zhang mat->spptr = bkok; 501076ba34aSJunchao Zhang } 502076ba34aSJunchao Zhang 5039566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 5049566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 5059566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 5069566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 5078c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5088c3ff71bSJunchao Zhang } 5098c3ff71bSJunchao Zhang 5100ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 5110ecb592aSJunchao Zhang { 5120ecb592aSJunchao Zhang Mat At; 513ff751488SJunchao Zhang KokkosCsrMatrix *internT; 5140ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 5150ecb592aSJunchao Zhang 5160ecb592aSJunchao Zhang PetscFunctionBegin; 5179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 5180ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 519ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 5209566063dSJacob Faibussowitsch PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 5219566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 5220ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5239566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 5240ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5250ecb592aSJunchao Zhang if ((*B)->assembled) { 5260ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5279566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 5289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 5290ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5300ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5310ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5320ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5339566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 5349566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5350ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5360ecb592aSJunchao Zhang } 5370ecb592aSJunchao Zhang PetscFunctionReturn(0); 5380ecb592aSJunchao Zhang } 5390ecb592aSJunchao Zhang 5408c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5418c3ff71bSJunchao Zhang { 54286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5438c3ff71bSJunchao Zhang 5448c3ff71bSJunchao Zhang PetscFunctionBegin; 54586a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 54686a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5478c3ff71bSJunchao Zhang delete aijkok; 54886a27549SJunchao Zhang } else { 54986a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 55086a27549SJunchao Zhang } 551cbc6b225SStefano Zampini A->spptr = NULL; 5529566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 5539566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 5549566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 5559566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 5568c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5578c3ff71bSJunchao Zhang } 5588c3ff71bSJunchao Zhang 5593f3ba80aSJunchao Zhang /*MC 5603f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 5613f3ba80aSJunchao Zhang 5623f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 5633f3ba80aSJunchao Zhang 5643f3ba80aSJunchao Zhang Options Database Keys: 5653f3ba80aSJunchao Zhang . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 5663f3ba80aSJunchao Zhang 5673f3ba80aSJunchao Zhang Level: beginner 5683f3ba80aSJunchao Zhang 5693f3ba80aSJunchao Zhang .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 5703f3ba80aSJunchao Zhang M*/ 57186a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 57286a27549SJunchao Zhang { 57386a27549SJunchao Zhang PetscFunctionBegin; 5749566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 5759566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 5769566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 57786a27549SJunchao Zhang PetscFunctionReturn(0); 57886a27549SJunchao Zhang } 57986a27549SJunchao Zhang 580076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 581076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 582a3f881fbSStefano Zampini { 583076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 584076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 585076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 586076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 587076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 588076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 589a3f881fbSStefano Zampini 590a3f881fbSStefano Zampini PetscFunctionBegin; 591076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 592076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 593076ba34aSJunchao Zhang PetscValidPointer(C,4); 594076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 595076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 5965f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 5975f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 598076ba34aSJunchao Zhang 5999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 6009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 601076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 602076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 603076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 604076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 605076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 606076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 607076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 608076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 609076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 610076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 611076ba34aSJunchao Zhang nnz = a->nz + b->nz; 612076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 613076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 614076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 615076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 616076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 617076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 618076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 619076ba34aSJunchao Zhang ca = ca_dual.view_device(); 620076ba34aSJunchao Zhang ci = ci_dual.view_device(); 621076ba34aSJunchao Zhang cj = cj_dual.view_device(); 622076ba34aSJunchao Zhang 623076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 624076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 625076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 626076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 627076ba34aSJunchao Zhang 628076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 629076ba34aSJunchao Zhang ci(i) = coffset; 630076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 631076ba34aSJunchao Zhang }); 632076ba34aSJunchao Zhang 633076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 634076ba34aSJunchao Zhang if (k < alen) { 635076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 636076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 637076ba34aSJunchao Zhang } else { 638076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 639076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 640076ba34aSJunchao Zhang } 641076ba34aSJunchao Zhang }); 642076ba34aSJunchao Zhang }); 643076ba34aSJunchao Zhang ca_dual.modify_device(); 644076ba34aSJunchao Zhang ci_dual.modify_device(); 645076ba34aSJunchao Zhang cj_dual.modify_device(); 6469566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 6479566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 648076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 649076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 650076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 651076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 652076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 653076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 654076ba34aSJunchao Zhang 655076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 656076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 657076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 658076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 659076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 660076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 661076ba34aSJunchao Zhang }); 662076ba34aSJunchao Zhang }); 6639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 664076ba34aSJunchao Zhang } 665076ba34aSJunchao Zhang PetscFunctionReturn(0); 666076ba34aSJunchao Zhang } 667076ba34aSJunchao Zhang 668076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 669076ba34aSJunchao Zhang { 670076ba34aSJunchao Zhang PetscFunctionBegin; 671076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 672a3f881fbSStefano Zampini PetscFunctionReturn(0); 673a3f881fbSStefano Zampini } 674a3f881fbSStefano Zampini 675a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 676a3f881fbSStefano Zampini { 677a3f881fbSStefano Zampini Mat_Product *product = C->product; 678a3f881fbSStefano Zampini Mat A,B; 679076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 680a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 681a3f881fbSStefano Zampini Mat_SeqAIJ *c; 682076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 683076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 684a3f881fbSStefano Zampini 685a3f881fbSStefano Zampini PetscFunctionBegin; 686a3f881fbSStefano Zampini MatCheckProduct(C,1); 6875f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 688076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 689076ba34aSJunchao Zhang 690076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 691076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 692076ba34aSJunchao Zhang PetscFunctionReturn(0); 693076ba34aSJunchao Zhang } 694076ba34aSJunchao Zhang 695076ba34aSJunchao Zhang switch (product->type) { 696076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 697076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 698076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 699076ba34aSJunchao Zhang default: 70098921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 701076ba34aSJunchao Zhang } 702076ba34aSJunchao Zhang 703a3f881fbSStefano Zampini A = product->A; 704a3f881fbSStefano Zampini B = product->B; 7059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 707a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 708a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 709a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 710076ba34aSJunchao Zhang 7115f80ce2aSJacob Faibussowitsch PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 712076ba34aSJunchao Zhang 713076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 714076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 715076ba34aSJunchao Zhang 716076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 717076ba34aSJunchao Zhang if (transA) { 7189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 719076ba34aSJunchao Zhang transA = false; 720a3f881fbSStefano Zampini } 721a3f881fbSStefano Zampini 722076ba34aSJunchao Zhang if (transB) { 7239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 724076ba34aSJunchao Zhang transB = false; 725076ba34aSJunchao Zhang } 7269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 7279566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 7289566063dSJacob Faibussowitsch PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 7299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 7309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 731a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 732a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 7339566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 7349566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 7359566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 736a3f881fbSStefano Zampini c->reallocs = 0; 737076ba34aSJunchao Zhang C->info.mallocs = 0; 738a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 739a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 740a3f881fbSStefano Zampini C->num_ass++; 741a3f881fbSStefano Zampini PetscFunctionReturn(0); 742a3f881fbSStefano Zampini } 743a3f881fbSStefano Zampini 744a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 745a3f881fbSStefano Zampini { 746076ba34aSJunchao Zhang Mat_Product *product = C->product; 747076ba34aSJunchao Zhang MatProductType ptype; 748076ba34aSJunchao Zhang Mat A,B; 749076ba34aSJunchao Zhang bool transA,transB; 750076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 751076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 752076ba34aSJunchao Zhang MPI_Comm comm; 753076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 754a3f881fbSStefano Zampini 755a3f881fbSStefano Zampini PetscFunctionBegin; 756a3f881fbSStefano Zampini MatCheckProduct(C,1); 7579566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 7585f80ce2aSJacob Faibussowitsch PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 759a3f881fbSStefano Zampini A = product->A; 760a3f881fbSStefano Zampini B = product->B; 7619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 763a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 764a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 765076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 766076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 767076ba34aSJunchao Zhang 768a3f881fbSStefano Zampini ptype = product->type; 769a3f881fbSStefano Zampini switch (ptype) { 770076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 771076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 772076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 773a3f881fbSStefano Zampini default: 77498921bdaSJacob Faibussowitsch SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 775a3f881fbSStefano Zampini } 776a3f881fbSStefano Zampini 777076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 778076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 779076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 780076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 781a3f881fbSStefano Zampini 782076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 783076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7846ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7856ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7866ffb9508SJunchao Zhang */ 787076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 788076ba34aSJunchao Zhang 7899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 790076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 791076ba34aSJunchao Zhang if (transA) { 7929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 793076ba34aSJunchao Zhang transA = false; 794076ba34aSJunchao Zhang } 795076ba34aSJunchao Zhang 796076ba34aSJunchao Zhang if (transB) { 7979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 798076ba34aSJunchao Zhang transB = false; 799076ba34aSJunchao Zhang } 800076ba34aSJunchao Zhang 8019566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 802076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 803076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 804076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 805076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 806076ba34aSJunchao Zhang */ 8079566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 8089566063dSJacob Faibussowitsch PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 8099566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 810076ba34aSJunchao Zhang 8119566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 8129566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 813076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 814a3f881fbSStefano Zampini PetscFunctionReturn(0); 815a3f881fbSStefano Zampini } 816a3f881fbSStefano Zampini 817a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 818076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 819a3f881fbSStefano Zampini { 820076ba34aSJunchao Zhang Mat_Product *product = mat->product; 821a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 822a3f881fbSStefano Zampini 823a3f881fbSStefano Zampini PetscFunctionBegin; 824a3f881fbSStefano Zampini MatCheckProduct(mat,1); 8259566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 826a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 8279566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 828a3f881fbSStefano Zampini } 829a3f881fbSStefano Zampini if (Biskok && Ciskok) { 830a3f881fbSStefano Zampini switch (product->type) { 831a3f881fbSStefano Zampini case MATPRODUCT_AB: 832a3f881fbSStefano Zampini case MATPRODUCT_AtB: 833a3f881fbSStefano Zampini case MATPRODUCT_ABt: 834a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 835a3f881fbSStefano Zampini break; 836a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 837a3f881fbSStefano Zampini case MATPRODUCT_RARt: 838a3f881fbSStefano Zampini case MATPRODUCT_ABC: 839a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 840a3f881fbSStefano Zampini break; 841a3f881fbSStefano Zampini default: 842a3f881fbSStefano Zampini break; 843a3f881fbSStefano Zampini } 844a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 8459566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 846a3f881fbSStefano Zampini } 847a3f881fbSStefano Zampini PetscFunctionReturn(0); 848a3f881fbSStefano Zampini } 849a587d139SMark 850f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 851f0cf5187SStefano Zampini { 852f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 853f0cf5187SStefano Zampini 854f0cf5187SStefano Zampini PetscFunctionBegin; 8559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 857f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 858076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 8599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 8609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 8619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 862f0cf5187SStefano Zampini PetscFunctionReturn(0); 863f0cf5187SStefano Zampini } 864f0cf5187SStefano Zampini 865a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 866a587d139SMark { 867076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 868a587d139SMark 869a587d139SMark PetscFunctionBegin; 870076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8712328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 872076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 8739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 8742328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8759566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 8762328674fSJunchao Zhang } 877a587d139SMark PetscFunctionReturn(0); 878a587d139SMark } 879a587d139SMark 880db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 88142550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 882db78de30SJunchao Zhang { 883db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 884db78de30SJunchao Zhang 885db78de30SJunchao Zhang PetscFunctionBegin; 886db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887db78de30SJunchao Zhang PetscValidPointer(kv,2); 888db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 8899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 890db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 891076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 892db78de30SJunchao Zhang PetscFunctionReturn(0); 893db78de30SJunchao Zhang } 894db78de30SJunchao Zhang 89542550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 896db78de30SJunchao Zhang { 897db78de30SJunchao Zhang PetscFunctionBegin; 898db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899db78de30SJunchao Zhang PetscValidPointer(kv,2); 900db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901db78de30SJunchao Zhang PetscFunctionReturn(0); 902db78de30SJunchao Zhang } 903db78de30SJunchao Zhang 90442550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 905db78de30SJunchao Zhang { 906db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 907db78de30SJunchao Zhang 908db78de30SJunchao Zhang PetscFunctionBegin; 909db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 910db78de30SJunchao Zhang PetscValidPointer(kv,2); 911db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 913db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 914076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 915db78de30SJunchao Zhang PetscFunctionReturn(0); 916db78de30SJunchao Zhang } 917db78de30SJunchao Zhang 91842550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 919db78de30SJunchao Zhang { 920db78de30SJunchao Zhang PetscFunctionBegin; 921db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 922db78de30SJunchao Zhang PetscValidPointer(kv,2); 923db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 925db78de30SJunchao Zhang PetscFunctionReturn(0); 926db78de30SJunchao Zhang } 927db78de30SJunchao Zhang 92842550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 929db78de30SJunchao Zhang { 930db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 931db78de30SJunchao Zhang 932db78de30SJunchao Zhang PetscFunctionBegin; 933db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 934db78de30SJunchao Zhang PetscValidPointer(kv,2); 935db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 936db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 937076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 938db78de30SJunchao Zhang PetscFunctionReturn(0); 939db78de30SJunchao Zhang } 940db78de30SJunchao Zhang 94142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 942db78de30SJunchao Zhang { 943db78de30SJunchao Zhang PetscFunctionBegin; 944db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 945db78de30SJunchao Zhang PetscValidPointer(kv,2); 946db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 948db78de30SJunchao Zhang PetscFunctionReturn(0); 949db78de30SJunchao Zhang } 950db78de30SJunchao Zhang 951c17cf699SJunchao Zhang /* Computes Y += alpha X */ 952c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 953a587d139SMark { 954a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 955c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 956c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 957c17cf699SJunchao Zhang MatScalarKokkosView Ya; 958a587d139SMark 959a587d139SMark PetscFunctionBegin; 960c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 961c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 9629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 9639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 9649566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 965db78de30SJunchao Zhang 966c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 967c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 968a587d139SMark PetscBool e; 9699566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 970a587d139SMark if (e) { 9719566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 972c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 973a587d139SMark } 974a587d139SMark } 975db78de30SJunchao Zhang 976c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 977c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 978c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 979c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 980c17cf699SJunchao Zhang */ 981c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 982c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 983c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 984c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 985c17cf699SJunchao Zhang 986c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 987c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 9889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 989c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 990c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 991c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 992c17cf699SJunchao Zhang 993c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 994c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 995c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 996c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 997c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 998c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 999c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1000c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1001c17cf699SJunchao Zhang q++; 1002a587d139SMark } else { 1003c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1004c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1005c17cf699SJunchao Zhang */ 1006c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1007a587d139SMark } 1008c17cf699SJunchao Zhang } 1009c17cf699SJunchao Zhang }); 1010c17cf699SJunchao Zhang }); 10119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1012c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1013c17cf699SJunchao Zhang Mat Z; 1014c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1015c17cf699SJunchao Zhang KernelHandle kh; 1016c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1017c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1018c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1019c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 10209566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 10219566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y,&Z)); 1022c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1023c17cf699SJunchao Zhang } 10249566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 10259566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1026a587d139SMark PetscFunctionReturn(0); 1027a587d139SMark } 1028a587d139SMark 1029394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 103042550becSJunchao Zhang { 103142550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 103242550becSJunchao Zhang Mat_SeqAIJ *aseq; 103342550becSJunchao Zhang 103442550becSJunchao Zhang PetscFunctionBegin; 10359566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1036394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(mat->data); 103742550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1038cbc6b225SStefano Zampini delete akok; 1039cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 10409566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1041394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 104242550becSJunchao Zhang PetscFunctionReturn(0); 104342550becSJunchao Zhang } 104442550becSJunchao Zhang 104542550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 104642550becSJunchao Zhang { 104742550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 104842550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1049394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1050394ed5ebSJunchao Zhang const PetscCountKokkosView& jmap = akok->jmap_d; 1051394ed5ebSJunchao Zhang const PetscCountKokkosView& perm = akok->perm_d; 105242550becSJunchao Zhang MatScalarKokkosView Aa; 105342550becSJunchao Zhang ConstMatScalarKokkosView kv; 105442550becSJunchao Zhang PetscMemType memtype; 105542550becSJunchao Zhang 105642550becSJunchao Zhang PetscFunctionBegin; 10579566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); 105842550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1059394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 106042550becSJunchao Zhang } else { 1061394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 106242550becSJunchao Zhang } 106342550becSJunchao Zhang 1064cbc6b225SStefano Zampini if (imode == INSERT_VALUES) { 10659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1066cbc6b225SStefano Zampini Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 10679566063dSJacob Faibussowitsch } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 106842550becSJunchao Zhang 1069cbc6b225SStefano Zampini Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));}); 1070394ed5ebSJunchao Zhang 10719566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 10729566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 107342550becSJunchao Zhang PetscFunctionReturn(0); 107442550becSJunchao Zhang } 107542550becSJunchao Zhang 107686a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 10778f7e8f9dSMark Adams { 10788f7e8f9dSMark Adams PetscFunctionBegin; 10799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 10809566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 10818f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 10828f7e8f9dSMark Adams PetscFunctionReturn(0); 10838f7e8f9dSMark Adams } 10848f7e8f9dSMark Adams 10858c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 10868c3ff71bSJunchao Zhang { 1087076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1088076ba34aSJunchao Zhang 10898c3ff71bSJunchao Zhang PetscFunctionBegin; 1090076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 10916f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 10926f3d89d0SStefano Zampini 10938c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 10948c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 10958c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1096a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1097f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1098a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1099076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 11008c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 11018c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 11028c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 11038c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 11048c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 11058c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1106076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 11070ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1108152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1109076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1110076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1111076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1112076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1113076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1114076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1115*7ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 111642550becSJunchao Zhang 11179566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 11189566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1119076ba34aSJunchao Zhang PetscFunctionReturn(0); 1120076ba34aSJunchao Zhang } 1121076ba34aSJunchao Zhang 1122076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1123076ba34aSJunchao Zhang { 1124076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1125076ba34aSJunchao Zhang PetscInt i,m,n; 1126076ba34aSJunchao Zhang 1127076ba34aSJunchao Zhang PetscFunctionBegin; 11285f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1129076ba34aSJunchao Zhang 1130076ba34aSJunchao Zhang m = akok->nrows(); 1131076ba34aSJunchao Zhang n = akok->ncols(); 11329566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A,m,n,m,n)); 11339566063dSJacob Faibussowitsch PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1134076ba34aSJunchao Zhang 1135076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 11369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1137076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1138076ba34aSJunchao Zhang 1139076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1140076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1141076ba34aSJunchao Zhang 1142076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1143076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1144076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1145076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1146076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1147076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1148076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1149076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1150076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1151076ba34aSJunchao Zhang 11529566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&aseq->imax)); 11539566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&aseq->ilen)); 1154076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1155076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1156076ba34aSJunchao Zhang } 1157076ba34aSJunchao Zhang 1158076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1159076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1160ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 11619566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 11629566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1163076ba34aSJunchao Zhang PetscFunctionReturn(0); 1164076ba34aSJunchao Zhang } 1165076ba34aSJunchao Zhang 1166076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1167076ba34aSJunchao Zhang 1168076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1169076ba34aSJunchao Zhang */ 1170076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1171076ba34aSJunchao Zhang { 1172076ba34aSJunchao Zhang PetscFunctionBegin; 11739566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 11749566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 11758c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11768c3ff71bSJunchao Zhang } 11778c3ff71bSJunchao Zhang 11788c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1179152b3e56SJunchao Zhang /*@C 11808c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 11818c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 11828c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 11838c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 11848c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 11858c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 11868c3ff71bSJunchao Zhang 11878c3ff71bSJunchao Zhang Collective 11888c3ff71bSJunchao Zhang 11898c3ff71bSJunchao Zhang Input Parameters: 11908c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 11918c3ff71bSJunchao Zhang . m - number of rows 11928c3ff71bSJunchao Zhang . n - number of columns 11938c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 11948c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 11958c3ff71bSJunchao Zhang (possibly different for each row) or NULL 11968c3ff71bSJunchao Zhang 11978c3ff71bSJunchao Zhang Output Parameter: 11988c3ff71bSJunchao Zhang . A - the matrix 11998c3ff71bSJunchao Zhang 12008c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 12018c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 12028c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 12038c3ff71bSJunchao Zhang 12048c3ff71bSJunchao Zhang Notes: 12058c3ff71bSJunchao Zhang If nnz is given then nz is ignored 12068c3ff71bSJunchao Zhang 12078c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 12088c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 12098c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 12108c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 12118c3ff71bSJunchao Zhang 12128c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 12138c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 12148c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 12158c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 12168c3ff71bSJunchao Zhang 12178c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 12188c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 12198c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 12208c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 12218c3ff71bSJunchao Zhang 12228c3ff71bSJunchao Zhang Level: intermediate 12238c3ff71bSJunchao Zhang 1224076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 12258c3ff71bSJunchao Zhang @*/ 12268c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 12278c3ff71bSJunchao Zhang { 12288c3ff71bSJunchao Zhang PetscFunctionBegin; 12299566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 12309566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 12319566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,m,n)); 12329566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 12339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 12348c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12358c3ff71bSJunchao Zhang } 1236930e68a5SMark Adams 12378f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 12388f7e8f9dSMark Adams // 123946804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 12408f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 12418f7e8f9dSMark Adams // 12428f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1243930e68a5SMark Adams { 12448f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 12458f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 12468f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 12478f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1248300d22a6SJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1249076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1250076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 12518f7e8f9dSMark Adams PetscBool row_identity,col_identity; 125246804e07SMark Adams PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1253930e68a5SMark Adams 1254930e68a5SMark Adams PetscFunctionBegin; 12552c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 12569566063dSJacob Faibussowitsch PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 12572c71b3e2SJacob Faibussowitsch PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 12589566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&r_h)); 12599566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol,&ic_h)); 12609566063dSJacob Faibussowitsch PetscCall(ISGetSize(isicol,&nc)); 12619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 12629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 12638f7e8f9dSMark Adams { 12648f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 12658f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 12668f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 12678f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 12688f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 12698f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 12708f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 12718f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 12728f7e8f9dSMark Adams size_t flops_h = 0.0; 12738f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 12748f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 12758f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 12768f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 12778f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 12788f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 12798f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 12808f7e8f9dSMark Adams // Fill A --> fact 12818f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1282042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 12838f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 12848f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 12858f7e8f9dSMark Adams // zero rows of B 12868f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12878f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 12888f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 12898f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 12908f7e8f9dSMark Adams /* zero (unfactored row) */ 12918f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 12928f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 12938f7e8f9dSMark Adams }); 12948f7e8f9dSMark Adams // copy A into B 12958f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12968f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 12978f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 12988f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 12998f7e8f9dSMark Adams /* load in initial (unfactored row) */ 13008f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 13018f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 13028f7e8f9dSMark Adams PetscScalar vala = av[j]; 13038f7e8f9dSMark Adams if (colb == rowb) { 13048f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 13058f7e8f9dSMark Adams } else { 13068f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13078f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13088f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 13098f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 13108f7e8f9dSMark Adams if (pbj[j] == colb) { 13118f7e8f9dSMark Adams pba[j] = vala; 13128f7e8f9dSMark Adams set++; 13138f7e8f9dSMark Adams break; 13148f7e8f9dSMark Adams } 13158f7e8f9dSMark Adams } 13168f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 13178f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 13188f1da0b2SJunchao Zhang #endif 13198f7e8f9dSMark Adams } 13208f7e8f9dSMark Adams } 13218f7e8f9dSMark Adams }); 13228f7e8f9dSMark Adams }); 13238f7e8f9dSMark Adams Kokkos::fence(); 1324930e68a5SMark Adams 13258f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 13268f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13278f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13288f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1329042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 13308f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 13318f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 13328f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 13338f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 13348f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 13358f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 13368f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 13378f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 13388f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 13398f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 13408f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 13418f7e8f9dSMark Adams else { 13428f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 13438f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 13448f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 13458f7e8f9dSMark Adams size_t st_idx; 13468f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 13478f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 13488f7e8f9dSMark Adams // get column, there has got to be a better way 13498f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 13508f7e8f9dSMark Adams if (pjL[j] == ii) { 13518f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13528f7e8f9dSMark Adams idx = j; // output 13538f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 13548f7e8f9dSMark Adams } 13558f7e8f9dSMark Adams }, st_idx); 13568f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 13578f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 135899551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 135999551766SMark Adams #endif 136099551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 13618f7e8f9dSMark Adams // U(i+1,:end) 13628f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 13638f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 13648f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 13658f7e8f9dSMark Adams if (col==myk) { 13668f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 13678f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 13688f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 13698f7e8f9dSMark Adams } else { 13708f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 13718f7e8f9dSMark Adams PetscInt high, low; 13728f7e8f9dSMark Adams const PetscInt *startj; 13738f7e8f9dSMark Adams if (col<myk) { // L 13748f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 13758f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 13768f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 13778f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 13788f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 13798f7e8f9dSMark Adams } else { 13808f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 13818f7e8f9dSMark Adams start = ba_d + idx; 13828f7e8f9dSMark Adams startj= bj_d + idx; 13838f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 13848f7e8f9dSMark Adams } 13858f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 13868f7e8f9dSMark Adams low = 0; 13878f7e8f9dSMark Adams high = (PetscInt)(end-start); 13888f7e8f9dSMark Adams while (high-low > 5) { 13898f7e8f9dSMark Adams int t = (low+high)/2; 13908f7e8f9dSMark Adams if (startj[t] > col) high = t; 13918f7e8f9dSMark Adams else low = t; 13928f7e8f9dSMark Adams } 13938f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 13948f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 13958f7e8f9dSMark Adams } 13968f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 139799551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 139899551766SMark Adams #endif 13998f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 14008f7e8f9dSMark Adams } 14018f7e8f9dSMark Adams }); 14028f7e8f9dSMark Adams } 14038f7e8f9dSMark Adams }); 14048f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 14058f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 14068f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 14078f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 14088f7e8f9dSMark Adams }); 14098f7e8f9dSMark Adams Kokkos::fence(); 14108f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 14119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 14128f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 14138f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 14148f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 14158f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 14168f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 14178f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 14188f7e8f9dSMark Adams if (i < end) { 14198f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 14208f7e8f9dSMark Adams *pv = 1.0/(*pv); 14218f7e8f9dSMark Adams } 14228f7e8f9dSMark Adams }); 14238f7e8f9dSMark Adams }); 14248f7e8f9dSMark Adams } 14259566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14269566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol,&ic_h)); 14279566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&r_h)); 14288f7e8f9dSMark Adams 14299566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 14309566063dSJacob Faibussowitsch PetscCall(ISIdentity(isicol,&col_identity)); 14318f7e8f9dSMark Adams if (b->inode.size) { 14328f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 14338f7e8f9dSMark Adams } else if (row_identity && col_identity) { 14348f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 14358f7e8f9dSMark Adams } else { 14368f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 14378f7e8f9dSMark Adams } 14388f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 14399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 14408f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 14418f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 14428f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 14438f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 14448f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 14458f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 14468f7e8f9dSMark Adams 1447930e68a5SMark Adams PetscFunctionReturn(0); 1448930e68a5SMark Adams } 1449930e68a5SMark Adams 145086a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1451930e68a5SMark Adams { 1452930e68a5SMark Adams PetscFunctionBegin; 14539566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 145486a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 145586a27549SJunchao Zhang PetscFunctionReturn(0); 145686a27549SJunchao Zhang } 145786a27549SJunchao Zhang 145886a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 145986a27549SJunchao Zhang { 146086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 146186a27549SJunchao Zhang 146286a27549SJunchao Zhang PetscFunctionBegin; 146386a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 146486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 146586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 146686a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 146786a27549SJunchao Zhang } 146886a27549SJunchao Zhang PetscFunctionReturn(0); 146986a27549SJunchao Zhang } 147086a27549SJunchao Zhang 147186a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 147286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 147386a27549SJunchao Zhang { 147486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1475076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 147686a27549SJunchao Zhang 147786a27549SJunchao Zhang PetscFunctionBegin; 147886a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 147986a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1480076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 148186a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1482076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1483076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 148486a27549SJunchao Zhang 148586a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1486076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1487076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1488076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 148986a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 149086a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 149186a27549SJunchao Zhang 149286a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 149386a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 149486a27549SJunchao Zhang */ 1495076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1496076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 149786a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 149886a27549SJunchao Zhang 149986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 150086a27549SJunchao Zhang 150186a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1502076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 150386a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1504076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1505076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 150686a27549SJunchao Zhang 150786a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1508076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1509076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1510076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 151186a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 151286a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 151386a27549SJunchao Zhang 151486a27549SJunchao Zhang /* Sort indices. See comments above */ 1515076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1516076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 151786a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 151886a27549SJunchao Zhang 151986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 152086a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 152186a27549SJunchao Zhang } 152286a27549SJunchao Zhang PetscFunctionReturn(0); 152386a27549SJunchao Zhang } 152486a27549SJunchao Zhang 152586a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 152686a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 152786a27549SJunchao Zhang { 152886a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 152986a27549SJunchao Zhang PetscScalarKokkosView xv; 153086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 153186a27549SJunchao Zhang 153286a27549SJunchao Zhang PetscFunctionBegin; 15339566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 15359566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b,&bv)); 15369566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x,&xv)); 153786a27549SJunchao Zhang /* Solve L tmpv = b */ 15389566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 153986a27549SJunchao Zhang /* Solve Ux = tmpv */ 15409566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 15419566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b,&bv)); 15429566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 15439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 154486a27549SJunchao Zhang PetscFunctionReturn(0); 154586a27549SJunchao Zhang } 154686a27549SJunchao Zhang 1547076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 154886a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 154986a27549SJunchao Zhang { 155086a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 155186a27549SJunchao Zhang PetscScalarKokkosView xv; 155286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 155386a27549SJunchao Zhang 155486a27549SJunchao Zhang PetscFunctionBegin; 15559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 15579566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b,&bv)); 15589566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x,&xv)); 155986a27549SJunchao Zhang /* Solve U^T tmpv = b */ 156086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 156186a27549SJunchao Zhang 156286a27549SJunchao Zhang /* Solve L^T x = tmpv */ 156386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 15649566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b,&bv)); 15659566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 15669566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 156786a27549SJunchao Zhang PetscFunctionReturn(0); 156886a27549SJunchao Zhang } 156986a27549SJunchao Zhang 157086a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 157186a27549SJunchao Zhang { 157286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 157386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 157486a27549SJunchao Zhang PetscInt fill_lev = info->levels; 157586a27549SJunchao Zhang 157686a27549SJunchao Zhang PetscFunctionBegin; 15779566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1579076ba34aSJunchao Zhang 1580076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1581076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1582076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1583076ba34aSJunchao Zhang 1584076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 158586a27549SJunchao Zhang 158686a27549SJunchao Zhang B->assembled = PETSC_TRUE; 158786a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 158886a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 158986a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 159086a27549SJunchao Zhang B->ops->matsolve = NULL; 159186a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 159286a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 159386a27549SJunchao Zhang 159486a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 159586a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 159686a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1597eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 15989566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 159986a27549SJunchao Zhang PetscFunctionReturn(0); 160086a27549SJunchao Zhang } 160186a27549SJunchao Zhang 160286a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 160386a27549SJunchao Zhang { 160486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 160586a27549SJunchao Zhang Mat_SeqAIJ *b; 160686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 160786a27549SJunchao Zhang PetscInt fill_lev = info->levels; 160886a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 160986a27549SJunchao Zhang PetscInt n = A->rmap->n; 161086a27549SJunchao Zhang 161186a27549SJunchao Zhang PetscFunctionBegin; 16129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 161386a27549SJunchao Zhang /* Rebuild factors */ 161486a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 161586a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 161686a27549SJunchao Zhang 161786a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 161886a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 161986a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 162086a27549SJunchao Zhang 162186a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 162286a27549SJunchao Zhang 162386a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 162486a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 162586a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 162686a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 162786a27549SJunchao Zhang 162886a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1629076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1630076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1631076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 163286a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 163386a27549SJunchao Zhang 163486a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 163586a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 163686a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 163786a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 163886a27549SJunchao Zhang 163986a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 164086a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 164186a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 164286a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 164386a27549SJunchao Zhang #else 164486a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 164586a27549SJunchao Zhang #endif 164686a27549SJunchao Zhang 164786a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 164886a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 164986a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 165086a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 165186a27549SJunchao Zhang 165286a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 16539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 165486a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 165586a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 165686a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 165786a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 165886a27549SJunchao Zhang 165986a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 166086a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1661930e68a5SMark Adams PetscFunctionReturn(0); 1662930e68a5SMark Adams } 1663930e68a5SMark Adams 16648f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 16658f7e8f9dSMark Adams { 16668f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 16678f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1668930e68a5SMark Adams 16698f7e8f9dSMark Adams PetscFunctionBegin; 16709566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 16718f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 16728f7e8f9dSMark Adams // move B data into Kokkos 16739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 16749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 16758f7e8f9dSMark Adams { 16768f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1677300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 16788f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1679300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1680300d22a6SJunchao Zhang Kokkos::deep_copy (baijkok->diag_d, h_diag); 16818f7e8f9dSMark Adams } 16828f7e8f9dSMark Adams } 16838f7e8f9dSMark Adams PetscFunctionReturn(0); 16848f7e8f9dSMark Adams } 16858f7e8f9dSMark Adams 168686a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1687930e68a5SMark Adams { 1688930e68a5SMark Adams PetscFunctionBegin; 1689930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1690930e68a5SMark Adams PetscFunctionReturn(0); 1691930e68a5SMark Adams } 1692930e68a5SMark Adams 16938f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 16948f7e8f9dSMark Adams { 16958f7e8f9dSMark Adams PetscFunctionBegin; 16968f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 16978f7e8f9dSMark Adams PetscFunctionReturn(0); 16988f7e8f9dSMark Adams } 16998f7e8f9dSMark Adams 1700930e68a5SMark Adams /*MC 170186a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 170286a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1703930e68a5SMark Adams 1704930e68a5SMark Adams Level: beginner 1705930e68a5SMark Adams 17063f3ba80aSJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1707930e68a5SMark Adams M*/ 170886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1709930e68a5SMark Adams { 1710930e68a5SMark Adams PetscInt n = A->rmap->n; 1711930e68a5SMark Adams 1712930e68a5SMark Adams PetscFunctionBegin; 17139566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 17149566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B,n,n,n,n)); 1715930e68a5SMark Adams (*B)->factortype = ftype; 17169566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 17179566063dSJacob Faibussowitsch PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1718930e68a5SMark Adams 17198f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17209566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 172186a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 172286a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 172386a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 17249566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 172586a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 172686a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 172798921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1728930e68a5SMark Adams 17299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 17309566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1731930e68a5SMark Adams PetscFunctionReturn(0); 1732930e68a5SMark Adams } 17338f7e8f9dSMark Adams 17348f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 17358f7e8f9dSMark Adams { 17368f7e8f9dSMark Adams PetscInt n = A->rmap->n; 17378f7e8f9dSMark Adams 17388f7e8f9dSMark Adams PetscFunctionBegin; 17399566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 17409566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B,n,n,n,n)); 17418f7e8f9dSMark Adams (*B)->factortype = ftype; 1742f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 17439566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 17449566063dSJacob Faibussowitsch PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 17458f7e8f9dSMark Adams 17468f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17479566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 17488f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17498f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 17508f7e8f9dSMark Adams 17519566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 17529566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 17538f7e8f9dSMark Adams PetscFunctionReturn(0); 17548f7e8f9dSMark Adams } 175586a27549SJunchao Zhang 175686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 175786a27549SJunchao Zhang { 175886a27549SJunchao Zhang PetscFunctionBegin; 17599566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 17609566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 17619566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 176286a27549SJunchao Zhang PetscFunctionReturn(0); 176386a27549SJunchao Zhang } 176486a27549SJunchao Zhang 1765076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1766076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1767076ba34aSJunchao Zhang { 1768076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1769076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1770076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1771076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1772076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1773076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1774076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1775076ba34aSJunchao Zhang 1776076ba34aSJunchao Zhang PetscFunctionBegin; 17779566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1778076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 17799566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1780076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 17819566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p])); 1782076ba34aSJunchao Zhang } 17839566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1784076ba34aSJunchao Zhang } 1785076ba34aSJunchao Zhang PetscFunctionReturn(0); 1786076ba34aSJunchao Zhang } 1787