111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1786a27549SJunchao Zhang 1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 288c3ff71bSJunchao Zhang PetscErrorCode ierr; 29076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 30076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang PetscFunctionBegin; 33076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 348c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35076ba34aSJunchao Zhang 36076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38076ba34aSJunchao Zhang 39076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 40076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 42076ba34aSJunchao Zhang */ 43076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44076ba34aSJunchao Zhang delete aijkok; 45076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46076ba34aSJunchao Zhang A->spptr = aijkok; 47076ba34aSJunchao Zhang } 48076ba34aSJunchao Zhang 49a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 50a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51a587d139SMark } 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 578c3ff71bSJunchao Zhang { 588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598c3ff71bSJunchao Zhang 608c3ff71bSJunchao Zhang PetscFunctionBegin; 612c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 622c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 632c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 65076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 66580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 6786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang PetscFunctionReturn(0); 708c3ff71bSJunchao Zhang } 718c3ff71bSJunchao Zhang 72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 73fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7486a27549SJunchao Zhang { 7586a27549SJunchao Zhang PetscErrorCode ierr; 7686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7786a27549SJunchao Zhang 7886a27549SJunchao Zhang PetscFunctionBegin; 792c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 8086a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8186a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 842328674fSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 8586a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8686a27549SJunchao Zhang PetscFunctionReturn(0); 8786a27549SJunchao Zhang } 8886a27549SJunchao Zhang 89f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90f0cf5187SStefano Zampini { 91f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92f0cf5187SStefano Zampini 93f0cf5187SStefano Zampini PetscFunctionBegin; 94f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9586a27549SJunchao Zhang /* We do not expect one needs factors on host */ 962c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 972c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 99f0cf5187SStefano Zampini PetscFunctionReturn(0); 100f0cf5187SStefano Zampini } 101f0cf5187SStefano Zampini 102f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103f0cf5187SStefano Zampini { 104076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105f0cf5187SStefano Zampini 106f0cf5187SStefano Zampini PetscFunctionBegin; 107076ba34aSJunchao Zhang if (aijkok) { 108076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 109076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 110076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 111076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112076ba34aSJunchao Zhang } 113076ba34aSJunchao Zhang PetscFunctionReturn(0); 114076ba34aSJunchao Zhang } 115076ba34aSJunchao Zhang 116076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117076ba34aSJunchao Zhang { 118076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119076ba34aSJunchao Zhang 120076ba34aSJunchao Zhang PetscFunctionBegin; 121076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 122076ba34aSJunchao Zhang PetscFunctionReturn(0); 123076ba34aSJunchao Zhang } 124076ba34aSJunchao Zhang 125076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126076ba34aSJunchao Zhang { 127076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128076ba34aSJunchao Zhang 129076ba34aSJunchao Zhang PetscFunctionBegin; 1302328674fSJunchao Zhang if (aijkok) { 131076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 132076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1332328674fSJunchao Zhang } else { 1342328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1352328674fSJunchao Zhang } 136076ba34aSJunchao Zhang PetscFunctionReturn(0); 137076ba34aSJunchao Zhang } 138076ba34aSJunchao Zhang 139076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140076ba34aSJunchao Zhang { 141076ba34aSJunchao Zhang PetscFunctionBegin; 142076ba34aSJunchao Zhang *array = NULL; 143076ba34aSJunchao Zhang PetscFunctionReturn(0); 144076ba34aSJunchao Zhang } 145076ba34aSJunchao Zhang 146076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147076ba34aSJunchao Zhang { 148076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149076ba34aSJunchao Zhang 150076ba34aSJunchao Zhang PetscFunctionBegin; 1512328674fSJunchao Zhang if (aijkok) { 152076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1532328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1542328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1552328674fSJunchao Zhang } 156076ba34aSJunchao Zhang PetscFunctionReturn(0); 157076ba34aSJunchao Zhang } 158076ba34aSJunchao Zhang 159076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160076ba34aSJunchao Zhang { 161076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162076ba34aSJunchao Zhang 163076ba34aSJunchao Zhang PetscFunctionBegin; 1642328674fSJunchao Zhang if (aijkok) { 165076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 166076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1672328674fSJunchao Zhang } 168f0cf5187SStefano Zampini PetscFunctionReturn(0); 169f0cf5187SStefano Zampini } 170f0cf5187SStefano Zampini 171a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173a587d139SMark { 174042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176a587d139SMark 177a587d139SMark PetscFunctionBegin; 1782c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181a587d139SMark PetscFunctionReturn(0); 182a587d139SMark } 183a587d139SMark 184a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186a587d139SMark { 187042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188a587d139SMark 189a587d139SMark PetscFunctionBegin; 190a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 191a587d139SMark *d_mat = aijkok->device_mat_d.data(); 192a587d139SMark } else { 193a587d139SMark PetscErrorCode ierr; 194a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195a587d139SMark *d_mat = NULL; 196a587d139SMark } 197a587d139SMark PetscFunctionReturn(0); 198a587d139SMark } 199076ba34aSJunchao Zhang 200076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 201076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202152b3e56SJunchao Zhang { 203152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204152b3e56SJunchao Zhang 205152b3e56SJunchao Zhang PetscFunctionBegin; 2062c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 210076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 21286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 213076ba34aSJunchao Zhang } 214076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 215152b3e56SJunchao Zhang PetscFunctionReturn(0); 216152b3e56SJunchao Zhang } 217152b3e56SJunchao Zhang 218076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220152b3e56SJunchao Zhang { 221eeadb341SJunchao Zhang PetscErrorCode ierr; 222152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223152b3e56SJunchao Zhang 224152b3e56SJunchao Zhang PetscFunctionBegin; 225eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2262c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 229076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 232076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 233076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234076ba34aSJunchao Zhang #endif 23586a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 236076ba34aSJunchao Zhang } 237076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 238eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239152b3e56SJunchao Zhang PetscFunctionReturn(0); 240152b3e56SJunchao Zhang } 241a587d139SMark 2428c3ff71bSJunchao Zhang /* y = A x */ 2438c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2448c3ff71bSJunchao Zhang { 2458c3ff71bSJunchao Zhang PetscErrorCode ierr; 2468c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 247152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 248152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2498c3ff71bSJunchao Zhang 2508c3ff71bSJunchao Zhang PetscFunctionBegin; 251*6af1d01cSJed Brown ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2528c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 2558c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 259076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 261*6af1d01cSJed Brown ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2628c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2638c3ff71bSJunchao Zhang } 2648c3ff71bSJunchao Zhang 2658c3ff71bSJunchao Zhang /* y = A^T x */ 2668c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2678c3ff71bSJunchao Zhang { 2688c3ff71bSJunchao Zhang PetscErrorCode ierr; 2698c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 270152b3e56SJunchao Zhang const char *mode; 271152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 272152b3e56SJunchao Zhang PetscScalarKokkosView yv; 273076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2748c3ff71bSJunchao Zhang 2758c3ff71bSJunchao Zhang PetscFunctionBegin; 276eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2778c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 280152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 281076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282152b3e56SJunchao Zhang mode = "N"; 283152b3e56SJunchao Zhang } else { 284076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 286152b3e56SJunchao Zhang mode = "T"; 287152b3e56SJunchao Zhang } 288076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 291076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2938c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2948c3ff71bSJunchao Zhang } 2958c3ff71bSJunchao Zhang 2968c3ff71bSJunchao Zhang /* y = A^H x */ 2978c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2988c3ff71bSJunchao Zhang { 2998c3ff71bSJunchao Zhang PetscErrorCode ierr; 3008c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 301152b3e56SJunchao Zhang const char *mode; 302152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 303152b3e56SJunchao Zhang PetscScalarKokkosView yv; 304076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3058c3ff71bSJunchao Zhang 3068c3ff71bSJunchao Zhang PetscFunctionBegin; 307eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3088c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 311152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 312076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313152b3e56SJunchao Zhang mode = "N"; 314152b3e56SJunchao Zhang } else { 315076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 317152b3e56SJunchao Zhang mode = "C"; 318152b3e56SJunchao Zhang } 319076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 322076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3248c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3258c3ff71bSJunchao Zhang } 3268c3ff71bSJunchao Zhang 3278c3ff71bSJunchao Zhang /* z = A x + y */ 3288c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3298c3ff71bSJunchao Zhang { 3308c3ff71bSJunchao Zhang PetscErrorCode ierr; 3318c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 332152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 333152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3348c3ff71bSJunchao Zhang 3358c3ff71bSJunchao Zhang PetscFunctionBegin; 336eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3378c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 3418c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3428c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 347152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3498c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3508c3ff71bSJunchao Zhang } 3518c3ff71bSJunchao Zhang 3528c3ff71bSJunchao Zhang /* z = A^T x + y */ 3538c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3548c3ff71bSJunchao Zhang { 3558c3ff71bSJunchao Zhang PetscErrorCode ierr; 3568c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 357152b3e56SJunchao Zhang const char *mode; 358152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 359152b3e56SJunchao Zhang PetscScalarKokkosView zv; 360076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3618c3ff71bSJunchao Zhang 3628c3ff71bSJunchao Zhang PetscFunctionBegin; 363eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3648c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 3688c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 369152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 370076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371152b3e56SJunchao Zhang mode = "N"; 372152b3e56SJunchao Zhang } else { 373076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 375152b3e56SJunchao Zhang mode = "T"; 376152b3e56SJunchao Zhang } 377076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 381076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3838c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3848c3ff71bSJunchao Zhang } 3858c3ff71bSJunchao Zhang 3868c3ff71bSJunchao Zhang /* z = A^H x + y */ 3878c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3888c3ff71bSJunchao Zhang { 3898c3ff71bSJunchao Zhang PetscErrorCode ierr; 3908c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 391152b3e56SJunchao Zhang const char *mode; 392152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 393152b3e56SJunchao Zhang PetscScalarKokkosView zv; 394076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3958c3ff71bSJunchao Zhang 3968c3ff71bSJunchao Zhang PetscFunctionBegin; 397eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3988c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 4028c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 403152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 404076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405152b3e56SJunchao Zhang mode = "N"; 406152b3e56SJunchao Zhang } else { 407076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 409152b3e56SJunchao Zhang mode = "C"; 410152b3e56SJunchao Zhang } 411076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 415076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417152b3e56SJunchao Zhang PetscFunctionReturn(0); 418152b3e56SJunchao Zhang } 419152b3e56SJunchao Zhang 420152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421152b3e56SJunchao Zhang { 422152b3e56SJunchao Zhang PetscErrorCode ierr; 423152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424152b3e56SJunchao Zhang 425152b3e56SJunchao Zhang PetscFunctionBegin; 426152b3e56SJunchao Zhang switch (op) { 427152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 428152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 431152b3e56SJunchao Zhang break; 432152b3e56SJunchao Zhang default: 433152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434152b3e56SJunchao Zhang break; 435152b3e56SJunchao Zhang } 4368c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4378c3ff71bSJunchao Zhang } 4388c3ff71bSJunchao Zhang 439076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4403d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4418c3ff71bSJunchao Zhang { 4428c3ff71bSJunchao Zhang PetscErrorCode ierr; 443076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4448c3ff71bSJunchao Zhang 4458c3ff71bSJunchao Zhang PetscFunctionBegin; 446a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4498c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 4522c71b3e2SJacob Faibussowitsch PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 458394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 4592c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4618c3ff71bSJunchao Zhang } 462076ba34aSJunchao Zhang } 4638c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4648c3ff71bSJunchao Zhang } 4658c3ff71bSJunchao Zhang 466076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468076ba34aSJunchao Zhang */ 469076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4708c3ff71bSJunchao Zhang { 4718c3ff71bSJunchao Zhang PetscErrorCode ierr; 472076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 473076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474076ba34aSJunchao Zhang Mat mat; 4758c3ff71bSJunchao Zhang 4768c3ff71bSJunchao Zhang PetscFunctionBegin; 477076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479076ba34aSJunchao Zhang mat = *B; 480076ba34aSJunchao Zhang if (A->assembled) { 481076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484076ba34aSJunchao Zhang /* Now copy values to B if needed */ 485076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 486076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 487076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 489076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 490076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 492076ba34aSJunchao Zhang } 493076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 496076ba34aSJunchao Zhang } 497076ba34aSJunchao Zhang mat->spptr = bkok; 498076ba34aSJunchao Zhang } 499076ba34aSJunchao Zhang 500076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 5048c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5058c3ff71bSJunchao Zhang } 5068c3ff71bSJunchao Zhang 5070ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 5080ecb592aSJunchao Zhang { 5090ecb592aSJunchao Zhang PetscErrorCode ierr; 5100ecb592aSJunchao Zhang Mat At; 511ff751488SJunchao Zhang KokkosCsrMatrix *internT; 5120ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 5130ecb592aSJunchao Zhang 5140ecb592aSJunchao Zhang PetscFunctionBegin; 5150ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 5160ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 517ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 518ff751488SJunchao Zhang CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 5190ecb592aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 5200ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5217a3b1c56SStefano Zampini else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 5220ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5230ecb592aSJunchao Zhang if ((*B)->assembled) { 5240ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5250ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 5260ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 5270ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5280ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5290ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5300ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5310ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 5320ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5330ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5340ecb592aSJunchao Zhang } 5350ecb592aSJunchao Zhang PetscFunctionReturn(0); 5360ecb592aSJunchao Zhang } 5370ecb592aSJunchao Zhang 5388c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5398c3ff71bSJunchao Zhang { 5408c3ff71bSJunchao Zhang PetscErrorCode ierr; 54186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5428c3ff71bSJunchao Zhang 5438c3ff71bSJunchao Zhang PetscFunctionBegin; 54486a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 54586a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5468c3ff71bSJunchao Zhang delete aijkok; 54786a27549SJunchao Zhang } else { 54886a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 54986a27549SJunchao Zhang } 550152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 55142550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 55242550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 553152b3e56SJunchao Zhang A->spptr = NULL; 5548c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5558c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5568c3ff71bSJunchao Zhang } 5578c3ff71bSJunchao Zhang 55886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 55986a27549SJunchao Zhang { 56086a27549SJunchao Zhang PetscErrorCode ierr; 56186a27549SJunchao Zhang 56286a27549SJunchao Zhang PetscFunctionBegin; 56386a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 56486a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 56586a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 56686a27549SJunchao Zhang PetscFunctionReturn(0); 56786a27549SJunchao Zhang } 56886a27549SJunchao Zhang 569076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 570076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 571a3f881fbSStefano Zampini { 572076ba34aSJunchao Zhang PetscErrorCode ierr; 573076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 574076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 575076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 576076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 577076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 578076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 579a3f881fbSStefano Zampini 580a3f881fbSStefano Zampini PetscFunctionBegin; 581076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 582076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 583076ba34aSJunchao Zhang PetscValidPointer(C,4); 584076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 585076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 5862c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 5872c71b3e2SJacob Faibussowitsch PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 588076ba34aSJunchao Zhang 589076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 590076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 591076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 592076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 593076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 594076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 595076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 596076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 597076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 598076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 599076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 600076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 601076ba34aSJunchao Zhang nnz = a->nz + b->nz; 602076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 603076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 604076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 605076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 606076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 607076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 608076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 609076ba34aSJunchao Zhang ca = ca_dual.view_device(); 610076ba34aSJunchao Zhang ci = ci_dual.view_device(); 611076ba34aSJunchao Zhang cj = cj_dual.view_device(); 612076ba34aSJunchao Zhang 613076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 614076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 615076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 616076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 617076ba34aSJunchao Zhang 618076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 619076ba34aSJunchao Zhang ci(i) = coffset; 620076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 621076ba34aSJunchao Zhang }); 622076ba34aSJunchao Zhang 623076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 624076ba34aSJunchao Zhang if (k < alen) { 625076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 626076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 627076ba34aSJunchao Zhang } else { 628076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 629076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 630076ba34aSJunchao Zhang } 631076ba34aSJunchao Zhang }); 632076ba34aSJunchao Zhang }); 633076ba34aSJunchao Zhang ca_dual.modify_device(); 634076ba34aSJunchao Zhang ci_dual.modify_device(); 635076ba34aSJunchao Zhang cj_dual.modify_device(); 636076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 637076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 638076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 639076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 640076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 641076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 642076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 643076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 644076ba34aSJunchao Zhang 645076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 646076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 647076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 648076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 649076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 650076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 651076ba34aSJunchao Zhang }); 652076ba34aSJunchao Zhang }); 653076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 654076ba34aSJunchao Zhang } 655076ba34aSJunchao Zhang PetscFunctionReturn(0); 656076ba34aSJunchao Zhang } 657076ba34aSJunchao Zhang 658076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 659076ba34aSJunchao Zhang { 660076ba34aSJunchao Zhang PetscFunctionBegin; 661076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 662a3f881fbSStefano Zampini PetscFunctionReturn(0); 663a3f881fbSStefano Zampini } 664a3f881fbSStefano Zampini 665a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 666a3f881fbSStefano Zampini { 667076ba34aSJunchao Zhang PetscErrorCode ierr; 668a3f881fbSStefano Zampini Mat_Product *product = C->product; 669a3f881fbSStefano Zampini Mat A,B; 670076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 671a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 672a3f881fbSStefano Zampini Mat_SeqAIJ *c; 673076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 674076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 675a3f881fbSStefano Zampini 676a3f881fbSStefano Zampini PetscFunctionBegin; 677a3f881fbSStefano Zampini MatCheckProduct(C,1); 6782c71b3e2SJacob Faibussowitsch PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 679076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 680076ba34aSJunchao Zhang 681076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 682076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 683076ba34aSJunchao Zhang PetscFunctionReturn(0); 684076ba34aSJunchao Zhang } 685076ba34aSJunchao Zhang 686076ba34aSJunchao Zhang switch (product->type) { 687076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 688076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 689076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 690076ba34aSJunchao Zhang default: 69198921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 692076ba34aSJunchao Zhang } 693076ba34aSJunchao Zhang 694a3f881fbSStefano Zampini A = product->A; 695a3f881fbSStefano Zampini B = product->B; 696a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 697a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 698a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 699a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 700a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 701076ba34aSJunchao Zhang 7022c71b3e2SJacob Faibussowitsch PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 703076ba34aSJunchao Zhang 704076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 705076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 706076ba34aSJunchao Zhang 707076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 708076ba34aSJunchao Zhang if (transA) { 709076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 710076ba34aSJunchao Zhang transA = false; 711a3f881fbSStefano Zampini } 712a3f881fbSStefano Zampini 713076ba34aSJunchao Zhang if (transB) { 714076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 715076ba34aSJunchao Zhang transB = false; 716076ba34aSJunchao Zhang } 717eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 718076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 719076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 720eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 721076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 722a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 723a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 7247d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 725a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 7267d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 727a3f881fbSStefano Zampini c->reallocs = 0; 728076ba34aSJunchao Zhang C->info.mallocs = 0; 729a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 730a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 731a3f881fbSStefano Zampini C->num_ass++; 732a3f881fbSStefano Zampini PetscFunctionReturn(0); 733a3f881fbSStefano Zampini } 734a3f881fbSStefano Zampini 735a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 736a3f881fbSStefano Zampini { 737a3f881fbSStefano Zampini PetscErrorCode ierr; 738076ba34aSJunchao Zhang Mat_Product *product = C->product; 739076ba34aSJunchao Zhang MatProductType ptype; 740076ba34aSJunchao Zhang Mat A,B; 741076ba34aSJunchao Zhang bool transA,transB; 742076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 743076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 744076ba34aSJunchao Zhang MPI_Comm comm; 745076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 746a3f881fbSStefano Zampini 747a3f881fbSStefano Zampini PetscFunctionBegin; 748a3f881fbSStefano Zampini MatCheckProduct(C,1); 749076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 7502c71b3e2SJacob Faibussowitsch PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 751a3f881fbSStefano Zampini A = product->A; 752a3f881fbSStefano Zampini B = product->B; 753a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 754a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 755a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 756a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 757076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 758076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 759076ba34aSJunchao Zhang 760a3f881fbSStefano Zampini ptype = product->type; 761a3f881fbSStefano Zampini switch (ptype) { 762076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 763076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 764076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 765a3f881fbSStefano Zampini default: 76698921bdaSJacob Faibussowitsch SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 767a3f881fbSStefano Zampini } 768a3f881fbSStefano Zampini 769076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 770076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 771076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 772076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 773a3f881fbSStefano Zampini 774076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 775076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7766ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7776ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7786ffb9508SJunchao Zhang */ 779076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 780076ba34aSJunchao Zhang 781eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 782076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 783076ba34aSJunchao Zhang if (transA) { 784076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 785076ba34aSJunchao Zhang transA = false; 786076ba34aSJunchao Zhang } 787076ba34aSJunchao Zhang 788076ba34aSJunchao Zhang if (transB) { 789076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 790076ba34aSJunchao Zhang transB = false; 791076ba34aSJunchao Zhang } 792076ba34aSJunchao Zhang 793076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 794076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 795076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 796076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 797076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 798076ba34aSJunchao Zhang */ 799076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 800076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 801eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 802076ba34aSJunchao Zhang 803076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 804076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 805076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 806a3f881fbSStefano Zampini PetscFunctionReturn(0); 807a3f881fbSStefano Zampini } 808a3f881fbSStefano Zampini 809a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 810076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 811a3f881fbSStefano Zampini { 812a3f881fbSStefano Zampini PetscErrorCode ierr; 813076ba34aSJunchao Zhang Mat_Product *product = mat->product; 814a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 815a3f881fbSStefano Zampini 816a3f881fbSStefano Zampini PetscFunctionBegin; 817a3f881fbSStefano Zampini MatCheckProduct(mat,1); 818a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 819a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 820a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 821a3f881fbSStefano Zampini } 822a3f881fbSStefano Zampini if (Biskok && Ciskok) { 823a3f881fbSStefano Zampini switch (product->type) { 824a3f881fbSStefano Zampini case MATPRODUCT_AB: 825a3f881fbSStefano Zampini case MATPRODUCT_AtB: 826a3f881fbSStefano Zampini case MATPRODUCT_ABt: 827a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 828a3f881fbSStefano Zampini break; 829a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 830a3f881fbSStefano Zampini case MATPRODUCT_RARt: 831a3f881fbSStefano Zampini case MATPRODUCT_ABC: 832a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 833a3f881fbSStefano Zampini break; 834a3f881fbSStefano Zampini default: 835a3f881fbSStefano Zampini break; 836a3f881fbSStefano Zampini } 837a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 838a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 839a3f881fbSStefano Zampini } 840a3f881fbSStefano Zampini PetscFunctionReturn(0); 841a3f881fbSStefano Zampini } 842a587d139SMark 843f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 844f0cf5187SStefano Zampini { 845f0cf5187SStefano Zampini PetscErrorCode ierr; 846f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 847f0cf5187SStefano Zampini 848f0cf5187SStefano Zampini PetscFunctionBegin; 849eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 850f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 851f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 852076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 853076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 854076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 855*6af1d01cSJed Brown ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 856f0cf5187SStefano Zampini PetscFunctionReturn(0); 857f0cf5187SStefano Zampini } 858f0cf5187SStefano Zampini 859a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 860a587d139SMark { 861a587d139SMark PetscErrorCode ierr; 862076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 863a587d139SMark 864a587d139SMark PetscFunctionBegin; 865076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8662328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 867076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 868076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 8692328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8702328674fSJunchao Zhang ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 8712328674fSJunchao Zhang } 872a587d139SMark PetscFunctionReturn(0); 873a587d139SMark } 874a587d139SMark 875db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 87642550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 877db78de30SJunchao Zhang { 878db78de30SJunchao Zhang PetscErrorCode ierr; 879db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 880db78de30SJunchao Zhang 881db78de30SJunchao Zhang PetscFunctionBegin; 882db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 883db78de30SJunchao Zhang PetscValidPointer(kv,2); 884db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 885db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 886db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 887076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 888db78de30SJunchao Zhang PetscFunctionReturn(0); 889db78de30SJunchao Zhang } 890db78de30SJunchao Zhang 89142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 892db78de30SJunchao Zhang { 893db78de30SJunchao Zhang PetscFunctionBegin; 894db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 895db78de30SJunchao Zhang PetscValidPointer(kv,2); 896db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 897db78de30SJunchao Zhang PetscFunctionReturn(0); 898db78de30SJunchao Zhang } 899db78de30SJunchao Zhang 90042550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 901db78de30SJunchao Zhang { 902db78de30SJunchao Zhang PetscErrorCode ierr; 903db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 904db78de30SJunchao Zhang 905db78de30SJunchao Zhang PetscFunctionBegin; 906db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 907db78de30SJunchao Zhang PetscValidPointer(kv,2); 908db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 909db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 910db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 911076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 912db78de30SJunchao Zhang PetscFunctionReturn(0); 913db78de30SJunchao Zhang } 914db78de30SJunchao Zhang 91542550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 916db78de30SJunchao Zhang { 917db78de30SJunchao Zhang PetscErrorCode ierr; 918db78de30SJunchao Zhang 919db78de30SJunchao Zhang PetscFunctionBegin; 920db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 921db78de30SJunchao Zhang PetscValidPointer(kv,2); 922db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 923076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 924db78de30SJunchao Zhang PetscFunctionReturn(0); 925db78de30SJunchao Zhang } 926db78de30SJunchao Zhang 92742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 928db78de30SJunchao Zhang { 929db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 930db78de30SJunchao Zhang 931db78de30SJunchao Zhang PetscFunctionBegin; 932db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 933db78de30SJunchao Zhang PetscValidPointer(kv,2); 934db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 935db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 936076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 937db78de30SJunchao Zhang PetscFunctionReturn(0); 938db78de30SJunchao Zhang } 939db78de30SJunchao Zhang 94042550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 941db78de30SJunchao Zhang { 942db78de30SJunchao Zhang PetscErrorCode ierr; 943db78de30SJunchao Zhang 944db78de30SJunchao Zhang PetscFunctionBegin; 945db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 946db78de30SJunchao Zhang PetscValidPointer(kv,2); 947db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 948076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 949db78de30SJunchao Zhang PetscFunctionReturn(0); 950db78de30SJunchao Zhang } 951db78de30SJunchao Zhang 952c17cf699SJunchao Zhang /* Computes Y += alpha X */ 953c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 954a587d139SMark { 955a587d139SMark PetscErrorCode ierr; 956a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 957c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 958c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 959c17cf699SJunchao Zhang MatScalarKokkosView Ya; 960a587d139SMark 961a587d139SMark PetscFunctionBegin; 962c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 963c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 964c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 965c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 966eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 967db78de30SJunchao Zhang 968c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 969c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 970a587d139SMark PetscBool e; 971a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 972a587d139SMark if (e) { 973a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 974c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 975a587d139SMark } 976a587d139SMark } 977db78de30SJunchao Zhang 978c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 979c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 980c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 981c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 982c17cf699SJunchao Zhang */ 983c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 984c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 985c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 986c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 987c17cf699SJunchao Zhang 988c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 989c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 990c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 991c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 992c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 993c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 994c17cf699SJunchao Zhang 995c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 996c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 997c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 998c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 999c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1000c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1001c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1002c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1003c17cf699SJunchao Zhang q++; 1004a587d139SMark } else { 1005c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1006c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1007c17cf699SJunchao Zhang */ 1008c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1009a587d139SMark } 1010c17cf699SJunchao Zhang } 1011c17cf699SJunchao Zhang }); 1012c17cf699SJunchao Zhang }); 1013c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 1014c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1015c17cf699SJunchao Zhang Mat Z; 1016c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1017c17cf699SJunchao Zhang KernelHandle kh; 1018c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1019c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1020c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1021c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 1022c17cf699SJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1023c17cf699SJunchao Zhang ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1024c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1025c17cf699SJunchao Zhang } 1026eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1027eeadb341SJunchao Zhang ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1028a587d139SMark PetscFunctionReturn(0); 1029a587d139SMark } 1030a587d139SMark 1031394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 103242550becSJunchao Zhang { 103342550becSJunchao Zhang PetscErrorCode ierr; 1034394ed5ebSJunchao Zhang Mat newmat; 103542550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 103642550becSJunchao Zhang Mat_SeqAIJ *aseq; 103742550becSJunchao Zhang 103842550becSJunchao Zhang PetscFunctionBegin; 1039394ed5ebSJunchao Zhang ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 1040394ed5ebSJunchao Zhang ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 1041394ed5ebSJunchao Zhang ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr); 1042394ed5ebSJunchao Zhang ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 104342550becSJunchao Zhang ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 104442550becSJunchao Zhang ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 104542550becSJunchao Zhang ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1046394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(mat->data); 104742550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1048394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 104942550becSJunchao Zhang PetscFunctionReturn(0); 105042550becSJunchao Zhang } 105142550becSJunchao Zhang 105242550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 105342550becSJunchao Zhang { 105442550becSJunchao Zhang PetscErrorCode ierr; 105542550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 105642550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1057394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1058394ed5ebSJunchao Zhang const PetscCountKokkosView& jmap = akok->jmap_d; 1059394ed5ebSJunchao Zhang const PetscCountKokkosView& perm = akok->perm_d; 106042550becSJunchao Zhang MatScalarKokkosView Aa; 106142550becSJunchao Zhang ConstMatScalarKokkosView kv; 106242550becSJunchao Zhang PetscMemType memtype; 106342550becSJunchao Zhang 106442550becSJunchao Zhang PetscFunctionBegin; 1065394ed5ebSJunchao Zhang PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()"); 106642550becSJunchao Zhang ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 106742550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1068394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 106942550becSJunchao Zhang } else { 1070394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 107142550becSJunchao Zhang } 107242550becSJunchao Zhang 1073394ed5ebSJunchao Zhang if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */ 1074394ed5ebSJunchao Zhang else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */ 107542550becSJunchao Zhang 1076394ed5ebSJunchao Zhang Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1077b6c38306SJunchao Zhang PetscScalar sum = 0.0; 1078b6c38306SJunchao Zhang for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1079b6c38306SJunchao Zhang Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 108042550becSJunchao Zhang }); 1081394ed5ebSJunchao Zhang 1082394ed5ebSJunchao Zhang if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} 1083394ed5ebSJunchao Zhang else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);} 108442550becSJunchao Zhang PetscFunctionReturn(0); 108542550becSJunchao Zhang } 108642550becSJunchao Zhang 108786a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 10888f7e8f9dSMark Adams { 10898f7e8f9dSMark Adams PetscErrorCode ierr; 10908f7e8f9dSMark Adams 10918f7e8f9dSMark Adams PetscFunctionBegin; 10928f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 10938f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 10948f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 10958f7e8f9dSMark Adams PetscFunctionReturn(0); 10968f7e8f9dSMark Adams } 10978f7e8f9dSMark Adams 10988c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 10998c3ff71bSJunchao Zhang { 110042550becSJunchao Zhang PetscErrorCode ierr; 1101076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1102076ba34aSJunchao Zhang 11038c3ff71bSJunchao Zhang PetscFunctionBegin; 1104076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 11056f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 11066f3d89d0SStefano Zampini 11078c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 11088c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 11098c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1110a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1111f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1112a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1113076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 11148c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 11158c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 11168c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 11178c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 11188c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 11198c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1120076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 11210ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1122152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1123076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1124076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1125076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1126076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1127076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1128076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 112942550becSJunchao Zhang 113042550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 113142550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1132076ba34aSJunchao Zhang PetscFunctionReturn(0); 1133076ba34aSJunchao Zhang } 1134076ba34aSJunchao Zhang 1135076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1136076ba34aSJunchao Zhang { 1137076ba34aSJunchao Zhang PetscErrorCode ierr; 1138076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1139076ba34aSJunchao Zhang PetscInt i,m,n; 1140076ba34aSJunchao Zhang 1141076ba34aSJunchao Zhang PetscFunctionBegin; 11422c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1143076ba34aSJunchao Zhang 1144076ba34aSJunchao Zhang m = akok->nrows(); 1145076ba34aSJunchao Zhang n = akok->ncols(); 1146076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1147076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1148076ba34aSJunchao Zhang 1149076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1150076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1151076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1152076ba34aSJunchao Zhang 1153076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1154076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1155076ba34aSJunchao Zhang 1156076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1157076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1158076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1159076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1160076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1161076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1162076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1163076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1164076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1165076ba34aSJunchao Zhang 1166076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1167076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1168076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1169076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1170076ba34aSJunchao Zhang } 1171076ba34aSJunchao Zhang 1172076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1173076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1174ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1175076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1176076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1177076ba34aSJunchao Zhang PetscFunctionReturn(0); 1178076ba34aSJunchao Zhang } 1179076ba34aSJunchao Zhang 1180076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1181076ba34aSJunchao Zhang 1182076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1183076ba34aSJunchao Zhang */ 1184076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1185076ba34aSJunchao Zhang { 1186076ba34aSJunchao Zhang PetscErrorCode ierr; 1187076ba34aSJunchao Zhang 1188076ba34aSJunchao Zhang PetscFunctionBegin; 1189076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1190076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 11918c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11928c3ff71bSJunchao Zhang } 11938c3ff71bSJunchao Zhang 11948c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1195152b3e56SJunchao Zhang /*@C 11968c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 11978c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 11988c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 11998c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 12008c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 12018c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 12028c3ff71bSJunchao Zhang 12038c3ff71bSJunchao Zhang Collective 12048c3ff71bSJunchao Zhang 12058c3ff71bSJunchao Zhang Input Parameters: 12068c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 12078c3ff71bSJunchao Zhang . m - number of rows 12088c3ff71bSJunchao Zhang . n - number of columns 12098c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 12108c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 12118c3ff71bSJunchao Zhang (possibly different for each row) or NULL 12128c3ff71bSJunchao Zhang 12138c3ff71bSJunchao Zhang Output Parameter: 12148c3ff71bSJunchao Zhang . A - the matrix 12158c3ff71bSJunchao Zhang 12168c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 12178c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 12188c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 12198c3ff71bSJunchao Zhang 12208c3ff71bSJunchao Zhang Notes: 12218c3ff71bSJunchao Zhang If nnz is given then nz is ignored 12228c3ff71bSJunchao Zhang 12238c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 12248c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 12258c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 12268c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 12278c3ff71bSJunchao Zhang 12288c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 12298c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 12308c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 12318c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 12328c3ff71bSJunchao Zhang 12338c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 12348c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 12358c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 12368c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 12378c3ff71bSJunchao Zhang 12388c3ff71bSJunchao Zhang Level: intermediate 12398c3ff71bSJunchao Zhang 1240076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 12418c3ff71bSJunchao Zhang @*/ 12428c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 12438c3ff71bSJunchao Zhang { 12448c3ff71bSJunchao Zhang PetscErrorCode ierr; 12458c3ff71bSJunchao Zhang 12468c3ff71bSJunchao Zhang PetscFunctionBegin; 12478c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 12488c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 12498c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 12508c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 12518c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 12528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12538c3ff71bSJunchao Zhang } 1254930e68a5SMark Adams 12558f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 12568f7e8f9dSMark Adams // 125746804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 12588f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 12598f7e8f9dSMark Adams // 12608f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1261930e68a5SMark Adams { 12628f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 12638f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 12648f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1265930e68a5SMark Adams PetscErrorCode ierr; 12668f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1267300d22a6SJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1268076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1269076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 12708f7e8f9dSMark Adams PetscBool row_identity,col_identity; 127146804e07SMark Adams PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1272930e68a5SMark Adams 1273930e68a5SMark Adams PetscFunctionBegin; 12742c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 12758f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 12762c71b3e2SJacob Faibussowitsch PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 12778f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 12788f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 12798f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 12808f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 12818f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 12828f7e8f9dSMark Adams { 12838f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 12848f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 12858f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 12868f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 12878f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 12888f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 12898f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 12908f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 12918f7e8f9dSMark Adams size_t flops_h = 0.0; 12928f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 12938f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 12948f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 12958f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 12968f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 12978f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 12988f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 12998f7e8f9dSMark Adams // Fill A --> fact 13008f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1301042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 13028f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 13038f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 13048f7e8f9dSMark Adams // zero rows of B 13058f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 13068f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 13078f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 13088f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 13098f7e8f9dSMark Adams /* zero (unfactored row) */ 13108f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 13118f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 13128f7e8f9dSMark Adams }); 13138f7e8f9dSMark Adams // copy A into B 13148f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 13158f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 13168f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 13178f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 13188f7e8f9dSMark Adams /* load in initial (unfactored row) */ 13198f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 13208f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 13218f7e8f9dSMark Adams PetscScalar vala = av[j]; 13228f7e8f9dSMark Adams if (colb == rowb) { 13238f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 13248f7e8f9dSMark Adams } else { 13258f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13268f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13278f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 13288f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 13298f7e8f9dSMark Adams if (pbj[j] == colb) { 13308f7e8f9dSMark Adams pba[j] = vala; 13318f7e8f9dSMark Adams set++; 13328f7e8f9dSMark Adams break; 13338f7e8f9dSMark Adams } 13348f7e8f9dSMark Adams } 13358f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 13368f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 13378f1da0b2SJunchao Zhang #endif 13388f7e8f9dSMark Adams } 13398f7e8f9dSMark Adams } 13408f7e8f9dSMark Adams }); 13418f7e8f9dSMark Adams }); 13428f7e8f9dSMark Adams Kokkos::fence(); 1343930e68a5SMark Adams 13448f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 13458f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13468f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13478f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1348042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 13498f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 13508f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 13518f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 13528f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 13538f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 13548f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 13558f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 13568f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 13578f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 13588f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 13598f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 13608f7e8f9dSMark Adams else { 13618f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 13628f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 13638f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 13648f7e8f9dSMark Adams size_t st_idx; 13658f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 13668f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 13678f7e8f9dSMark Adams // get column, there has got to be a better way 13688f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 13698f7e8f9dSMark Adams if (pjL[j] == ii) { 13708f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13718f7e8f9dSMark Adams idx = j; // output 13728f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 13738f7e8f9dSMark Adams } 13748f7e8f9dSMark Adams }, st_idx); 13758f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 13768f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 137799551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 137899551766SMark Adams #endif 137999551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 13808f7e8f9dSMark Adams // U(i+1,:end) 13818f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 13828f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 13838f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 13848f7e8f9dSMark Adams if (col==myk) { 13858f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 13868f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 13878f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 13888f7e8f9dSMark Adams } else { 13898f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 13908f7e8f9dSMark Adams PetscInt high, low; 13918f7e8f9dSMark Adams const PetscInt *startj; 13928f7e8f9dSMark Adams if (col<myk) { // L 13938f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 13948f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 13958f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 13968f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 13978f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 13988f7e8f9dSMark Adams } else { 13998f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 14008f7e8f9dSMark Adams start = ba_d + idx; 14018f7e8f9dSMark Adams startj= bj_d + idx; 14028f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 14038f7e8f9dSMark Adams } 14048f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 14058f7e8f9dSMark Adams low = 0; 14068f7e8f9dSMark Adams high = (PetscInt)(end-start); 14078f7e8f9dSMark Adams while (high-low > 5) { 14088f7e8f9dSMark Adams int t = (low+high)/2; 14098f7e8f9dSMark Adams if (startj[t] > col) high = t; 14108f7e8f9dSMark Adams else low = t; 14118f7e8f9dSMark Adams } 14128f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 14138f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 14148f7e8f9dSMark Adams } 14158f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 141699551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 141799551766SMark Adams #endif 14188f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 14198f7e8f9dSMark Adams } 14208f7e8f9dSMark Adams }); 14218f7e8f9dSMark Adams } 14228f7e8f9dSMark Adams }); 14238f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 14248f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 14258f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 14268f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 14278f7e8f9dSMark Adams }); 14288f7e8f9dSMark Adams Kokkos::fence(); 14298f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 14308f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 14318f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 14328f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 14338f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 14348f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 14358f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 14368f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 14378f7e8f9dSMark Adams if (i < end) { 14388f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 14398f7e8f9dSMark Adams *pv = 1.0/(*pv); 14408f7e8f9dSMark Adams } 14418f7e8f9dSMark Adams }); 14428f7e8f9dSMark Adams }); 14438f7e8f9dSMark Adams } 14448f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 14458f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 14468f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 14478f7e8f9dSMark Adams 14488f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 14498f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 14508f7e8f9dSMark Adams if (b->inode.size) { 14518f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 14528f7e8f9dSMark Adams } else if (row_identity && col_identity) { 14538f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 14548f7e8f9dSMark Adams } else { 14558f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 14568f7e8f9dSMark Adams } 14578f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 14588f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 14598f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 14608f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 14618f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 14628f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 14638f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 14648f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 14658f7e8f9dSMark Adams 1466930e68a5SMark Adams PetscFunctionReturn(0); 1467930e68a5SMark Adams } 1468930e68a5SMark Adams 146986a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1470930e68a5SMark Adams { 1471930e68a5SMark Adams PetscErrorCode ierr; 1472930e68a5SMark Adams 1473930e68a5SMark Adams PetscFunctionBegin; 1474930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 147586a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 147686a27549SJunchao Zhang PetscFunctionReturn(0); 147786a27549SJunchao Zhang } 147886a27549SJunchao Zhang 147986a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 148086a27549SJunchao Zhang { 148186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 148286a27549SJunchao Zhang 148386a27549SJunchao Zhang PetscFunctionBegin; 148486a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 148586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 148686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 148786a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 148886a27549SJunchao Zhang } 148986a27549SJunchao Zhang PetscFunctionReturn(0); 149086a27549SJunchao Zhang } 149186a27549SJunchao Zhang 149286a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 149386a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 149486a27549SJunchao Zhang { 149586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1496076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 149786a27549SJunchao Zhang 149886a27549SJunchao Zhang PetscFunctionBegin; 149986a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 150086a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1501076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 150286a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1503076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1504076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 150586a27549SJunchao Zhang 150686a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1507076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1508076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1509076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 151086a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 151186a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 151286a27549SJunchao Zhang 151386a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 151486a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 151586a27549SJunchao Zhang */ 1516076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1517076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 151886a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 151986a27549SJunchao Zhang 152086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 152186a27549SJunchao Zhang 152286a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1523076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 152486a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1525076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1526076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 152786a27549SJunchao Zhang 152886a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1529076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1530076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1531076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 153286a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 153386a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 153486a27549SJunchao Zhang 153586a27549SJunchao Zhang /* Sort indices. See comments above */ 1536076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1537076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 153886a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 153986a27549SJunchao Zhang 154086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 154186a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 154286a27549SJunchao Zhang } 154386a27549SJunchao Zhang PetscFunctionReturn(0); 154486a27549SJunchao Zhang } 154586a27549SJunchao Zhang 154686a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 154786a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 154886a27549SJunchao Zhang { 154986a27549SJunchao Zhang PetscErrorCode ierr; 155086a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 155186a27549SJunchao Zhang PetscScalarKokkosView xv; 155286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 155386a27549SJunchao Zhang 155486a27549SJunchao Zhang PetscFunctionBegin; 1555eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 155686a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 155786a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1558ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 155986a27549SJunchao Zhang /* Solve L tmpv = b */ 15603f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 156186a27549SJunchao Zhang /* Solve Ux = tmpv */ 15623f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 156386a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1564ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1565eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 156686a27549SJunchao Zhang PetscFunctionReturn(0); 156786a27549SJunchao Zhang } 156886a27549SJunchao Zhang 1569076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 157086a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 157186a27549SJunchao Zhang { 157286a27549SJunchao Zhang PetscErrorCode ierr; 157386a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 157486a27549SJunchao Zhang PetscScalarKokkosView xv; 157586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 157686a27549SJunchao Zhang 157786a27549SJunchao Zhang PetscFunctionBegin; 1578eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 157986a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 158086a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1581ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 158286a27549SJunchao Zhang /* Solve U^T tmpv = b */ 158386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 158486a27549SJunchao Zhang 158586a27549SJunchao Zhang /* Solve L^T x = tmpv */ 158686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 158786a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1588ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1589eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 159086a27549SJunchao Zhang PetscFunctionReturn(0); 159186a27549SJunchao Zhang } 159286a27549SJunchao Zhang 159386a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 159486a27549SJunchao Zhang { 159586a27549SJunchao Zhang PetscErrorCode ierr; 159686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 159786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 159886a27549SJunchao Zhang PetscInt fill_lev = info->levels; 159986a27549SJunchao Zhang 160086a27549SJunchao Zhang PetscFunctionBegin; 1601eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 160286a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1603076ba34aSJunchao Zhang 1604076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1605076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1606076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1607076ba34aSJunchao Zhang 1608076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 160986a27549SJunchao Zhang 161086a27549SJunchao Zhang B->assembled = PETSC_TRUE; 161186a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 161286a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 161386a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 161486a27549SJunchao Zhang B->ops->matsolve = NULL; 161586a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 161686a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 161786a27549SJunchao Zhang 161886a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 161986a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 162086a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1621eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 1622eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 162386a27549SJunchao Zhang PetscFunctionReturn(0); 162486a27549SJunchao Zhang } 162586a27549SJunchao Zhang 162686a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 162786a27549SJunchao Zhang { 162886a27549SJunchao Zhang PetscErrorCode ierr; 162986a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 163086a27549SJunchao Zhang Mat_SeqAIJ *b; 163186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 163286a27549SJunchao Zhang PetscInt fill_lev = info->levels; 163386a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 163486a27549SJunchao Zhang PetscInt n = A->rmap->n; 163586a27549SJunchao Zhang 163686a27549SJunchao Zhang PetscFunctionBegin; 163786a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 163886a27549SJunchao Zhang /* Rebuild factors */ 163986a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 164086a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 164186a27549SJunchao Zhang 164286a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 164386a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 164486a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 164586a27549SJunchao Zhang 164686a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 164786a27549SJunchao Zhang 164886a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 164986a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 165086a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 165186a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 165286a27549SJunchao Zhang 165386a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1654076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1655076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1656076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 165786a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 165886a27549SJunchao Zhang 165986a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 166086a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 166186a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 166286a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 166386a27549SJunchao Zhang 166486a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 166586a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 166686a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 166786a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 166886a27549SJunchao Zhang #else 166986a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 167086a27549SJunchao Zhang #endif 167186a27549SJunchao Zhang 167286a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 167386a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 167486a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 167586a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 167686a27549SJunchao Zhang 167786a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 167886a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 167986a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 168086a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 168186a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 168286a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 168386a27549SJunchao Zhang 168486a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 168586a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1686930e68a5SMark Adams PetscFunctionReturn(0); 1687930e68a5SMark Adams } 1688930e68a5SMark Adams 16898f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 16908f7e8f9dSMark Adams { 16918f7e8f9dSMark Adams PetscErrorCode ierr; 16928f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 16938f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1694930e68a5SMark Adams 16958f7e8f9dSMark Adams PetscFunctionBegin; 16968f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 16978f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 16988f7e8f9dSMark Adams // move B data into Kokkos 16998f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 17008f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 17018f7e8f9dSMark Adams { 17028f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1703300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 17048f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1705300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1706300d22a6SJunchao Zhang Kokkos::deep_copy (baijkok->diag_d, h_diag); 17078f7e8f9dSMark Adams } 17088f7e8f9dSMark Adams } 17098f7e8f9dSMark Adams PetscFunctionReturn(0); 17108f7e8f9dSMark Adams } 17118f7e8f9dSMark Adams 171286a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1713930e68a5SMark Adams { 1714930e68a5SMark Adams PetscFunctionBegin; 1715930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1716930e68a5SMark Adams PetscFunctionReturn(0); 1717930e68a5SMark Adams } 1718930e68a5SMark Adams 17198f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 17208f7e8f9dSMark Adams { 17218f7e8f9dSMark Adams PetscFunctionBegin; 17228f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 17238f7e8f9dSMark Adams PetscFunctionReturn(0); 17248f7e8f9dSMark Adams } 17258f7e8f9dSMark Adams 1726930e68a5SMark Adams /*MC 172786a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 172886a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1729930e68a5SMark Adams 1730930e68a5SMark Adams Level: beginner 1731930e68a5SMark Adams 173286a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1733930e68a5SMark Adams M*/ 173486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1735930e68a5SMark Adams { 1736930e68a5SMark Adams PetscErrorCode ierr; 1737930e68a5SMark Adams PetscInt n = A->rmap->n; 1738930e68a5SMark Adams 1739930e68a5SMark Adams PetscFunctionBegin; 1740930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1741930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1742930e68a5SMark Adams (*B)->factortype = ftype; 17434ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1744930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1745930e68a5SMark Adams 17468f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1747930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 174886a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 174986a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 175086a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 175186a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 175286a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 175386a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 175498921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1755930e68a5SMark Adams 1756930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 175786a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1758930e68a5SMark Adams PetscFunctionReturn(0); 1759930e68a5SMark Adams } 17608f7e8f9dSMark Adams 17618f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 17628f7e8f9dSMark Adams { 17638f7e8f9dSMark Adams PetscErrorCode ierr; 17648f7e8f9dSMark Adams PetscInt n = A->rmap->n; 17658f7e8f9dSMark Adams 17668f7e8f9dSMark Adams PetscFunctionBegin; 17678f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 17688f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 17698f7e8f9dSMark Adams (*B)->factortype = ftype; 1770f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 17714ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 17728f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 17738f7e8f9dSMark Adams 17748f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17758f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 17768f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17778f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 17788f7e8f9dSMark Adams 17798f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 17808f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 17818f7e8f9dSMark Adams PetscFunctionReturn(0); 17828f7e8f9dSMark Adams } 178386a27549SJunchao Zhang 178486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 178586a27549SJunchao Zhang { 178686a27549SJunchao Zhang PetscErrorCode ierr; 178786a27549SJunchao Zhang 178886a27549SJunchao Zhang PetscFunctionBegin; 178986a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 179086a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 179186a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 179286a27549SJunchao Zhang PetscFunctionReturn(0); 179386a27549SJunchao Zhang } 179486a27549SJunchao Zhang 1795076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1796076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1797076ba34aSJunchao Zhang { 1798076ba34aSJunchao Zhang PetscErrorCode ierr; 1799076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1800076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1801076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1802076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1803076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1804076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1805076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1806076ba34aSJunchao Zhang 1807076ba34aSJunchao Zhang PetscFunctionBegin; 1808c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1809076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1810c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1811076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1812c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1813076ba34aSJunchao Zhang } 1814076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1815076ba34aSJunchao Zhang } 1816076ba34aSJunchao Zhang PetscFunctionReturn(0); 1817076ba34aSJunchao Zhang } 1818