111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1786a27549SJunchao Zhang 1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 288c3ff71bSJunchao Zhang PetscErrorCode ierr; 29076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 30076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang PetscFunctionBegin; 33076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 348c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35076ba34aSJunchao Zhang 36076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38076ba34aSJunchao Zhang 39076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 40076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 42076ba34aSJunchao Zhang */ 43076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44076ba34aSJunchao Zhang delete aijkok; 45076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46076ba34aSJunchao Zhang A->spptr = aijkok; 47076ba34aSJunchao Zhang } 48076ba34aSJunchao Zhang 49a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 50a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51a587d139SMark } 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 578c3ff71bSJunchao Zhang { 588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598c3ff71bSJunchao Zhang 608c3ff71bSJunchao Zhang PetscFunctionBegin; 612c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 622c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 632c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 65076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 66580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 6786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang PetscFunctionReturn(0); 708c3ff71bSJunchao Zhang } 718c3ff71bSJunchao Zhang 72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 73fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7486a27549SJunchao Zhang { 7586a27549SJunchao Zhang PetscErrorCode ierr; 7686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7786a27549SJunchao Zhang 7886a27549SJunchao Zhang PetscFunctionBegin; 792c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 8086a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8186a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 842328674fSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 8586a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8686a27549SJunchao Zhang PetscFunctionReturn(0); 8786a27549SJunchao Zhang } 8886a27549SJunchao Zhang 89f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90f0cf5187SStefano Zampini { 91f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92f0cf5187SStefano Zampini 93f0cf5187SStefano Zampini PetscFunctionBegin; 94f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9586a27549SJunchao Zhang /* We do not expect one needs factors on host */ 962c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 972c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 99f0cf5187SStefano Zampini PetscFunctionReturn(0); 100f0cf5187SStefano Zampini } 101f0cf5187SStefano Zampini 102f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103f0cf5187SStefano Zampini { 104076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105f0cf5187SStefano Zampini 106f0cf5187SStefano Zampini PetscFunctionBegin; 107076ba34aSJunchao Zhang if (aijkok) { 108076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 109076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 110076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 111076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112076ba34aSJunchao Zhang } 113076ba34aSJunchao Zhang PetscFunctionReturn(0); 114076ba34aSJunchao Zhang } 115076ba34aSJunchao Zhang 116076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117076ba34aSJunchao Zhang { 118076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119076ba34aSJunchao Zhang 120076ba34aSJunchao Zhang PetscFunctionBegin; 121076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 122076ba34aSJunchao Zhang PetscFunctionReturn(0); 123076ba34aSJunchao Zhang } 124076ba34aSJunchao Zhang 125076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126076ba34aSJunchao Zhang { 127076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128076ba34aSJunchao Zhang 129076ba34aSJunchao Zhang PetscFunctionBegin; 1302328674fSJunchao Zhang if (aijkok) { 131076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 132076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1332328674fSJunchao Zhang } else { 1342328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1352328674fSJunchao Zhang } 136076ba34aSJunchao Zhang PetscFunctionReturn(0); 137076ba34aSJunchao Zhang } 138076ba34aSJunchao Zhang 139076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140076ba34aSJunchao Zhang { 141076ba34aSJunchao Zhang PetscFunctionBegin; 142076ba34aSJunchao Zhang *array = NULL; 143076ba34aSJunchao Zhang PetscFunctionReturn(0); 144076ba34aSJunchao Zhang } 145076ba34aSJunchao Zhang 146076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147076ba34aSJunchao Zhang { 148076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149076ba34aSJunchao Zhang 150076ba34aSJunchao Zhang PetscFunctionBegin; 1512328674fSJunchao Zhang if (aijkok) { 152076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1532328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1542328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1552328674fSJunchao Zhang } 156076ba34aSJunchao Zhang PetscFunctionReturn(0); 157076ba34aSJunchao Zhang } 158076ba34aSJunchao Zhang 159076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160076ba34aSJunchao Zhang { 161076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162076ba34aSJunchao Zhang 163076ba34aSJunchao Zhang PetscFunctionBegin; 1642328674fSJunchao Zhang if (aijkok) { 165076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 166076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1672328674fSJunchao Zhang } 168f0cf5187SStefano Zampini PetscFunctionReturn(0); 169f0cf5187SStefano Zampini } 170f0cf5187SStefano Zampini 171a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173a587d139SMark { 174042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176a587d139SMark 177a587d139SMark PetscFunctionBegin; 1782c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181a587d139SMark PetscFunctionReturn(0); 182a587d139SMark } 183a587d139SMark 184a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186a587d139SMark { 187042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188a587d139SMark 189a587d139SMark PetscFunctionBegin; 190a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 191a587d139SMark *d_mat = aijkok->device_mat_d.data(); 192a587d139SMark } else { 193a587d139SMark PetscErrorCode ierr; 194a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195a587d139SMark *d_mat = NULL; 196a587d139SMark } 197a587d139SMark PetscFunctionReturn(0); 198a587d139SMark } 199076ba34aSJunchao Zhang 200076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 201076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202152b3e56SJunchao Zhang { 203152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204152b3e56SJunchao Zhang 205152b3e56SJunchao Zhang PetscFunctionBegin; 2062c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 210076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 21286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 213076ba34aSJunchao Zhang } 214076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 215152b3e56SJunchao Zhang PetscFunctionReturn(0); 216152b3e56SJunchao Zhang } 217152b3e56SJunchao Zhang 218076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220152b3e56SJunchao Zhang { 221eeadb341SJunchao Zhang PetscErrorCode ierr; 222152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223152b3e56SJunchao Zhang 224152b3e56SJunchao Zhang PetscFunctionBegin; 225eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2262c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 229076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 232076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 233076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234076ba34aSJunchao Zhang #endif 23586a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 236076ba34aSJunchao Zhang } 237076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 238eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239152b3e56SJunchao Zhang PetscFunctionReturn(0); 240152b3e56SJunchao Zhang } 241a587d139SMark 2428c3ff71bSJunchao Zhang /* y = A x */ 2438c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2448c3ff71bSJunchao Zhang { 2458c3ff71bSJunchao Zhang PetscErrorCode ierr; 2468c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 247152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 248152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2498c3ff71bSJunchao Zhang 2508c3ff71bSJunchao Zhang PetscFunctionBegin; 2516af1d01cSJed Brown ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2528c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 2558c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 259076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2616af1d01cSJed Brown ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2628c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2638c3ff71bSJunchao Zhang } 2648c3ff71bSJunchao Zhang 2658c3ff71bSJunchao Zhang /* y = A^T x */ 2668c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2678c3ff71bSJunchao Zhang { 2688c3ff71bSJunchao Zhang PetscErrorCode ierr; 2698c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 270152b3e56SJunchao Zhang const char *mode; 271152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 272152b3e56SJunchao Zhang PetscScalarKokkosView yv; 273076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2748c3ff71bSJunchao Zhang 2758c3ff71bSJunchao Zhang PetscFunctionBegin; 276eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2778c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 280152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 281076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282152b3e56SJunchao Zhang mode = "N"; 283152b3e56SJunchao Zhang } else { 284076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 286152b3e56SJunchao Zhang mode = "T"; 287152b3e56SJunchao Zhang } 288076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 291076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2938c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2948c3ff71bSJunchao Zhang } 2958c3ff71bSJunchao Zhang 2968c3ff71bSJunchao Zhang /* y = A^H x */ 2978c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2988c3ff71bSJunchao Zhang { 2998c3ff71bSJunchao Zhang PetscErrorCode ierr; 3008c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 301152b3e56SJunchao Zhang const char *mode; 302152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 303152b3e56SJunchao Zhang PetscScalarKokkosView yv; 304076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3058c3ff71bSJunchao Zhang 3068c3ff71bSJunchao Zhang PetscFunctionBegin; 307eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3088c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 311152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 312076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313152b3e56SJunchao Zhang mode = "N"; 314152b3e56SJunchao Zhang } else { 315076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 317152b3e56SJunchao Zhang mode = "C"; 318152b3e56SJunchao Zhang } 319076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 322076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3248c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3258c3ff71bSJunchao Zhang } 3268c3ff71bSJunchao Zhang 3278c3ff71bSJunchao Zhang /* z = A x + y */ 3288c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3298c3ff71bSJunchao Zhang { 3308c3ff71bSJunchao Zhang PetscErrorCode ierr; 3318c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 332152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 333152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3348c3ff71bSJunchao Zhang 3358c3ff71bSJunchao Zhang PetscFunctionBegin; 336eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3378c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 3418c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3428c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 347152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3498c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3508c3ff71bSJunchao Zhang } 3518c3ff71bSJunchao Zhang 3528c3ff71bSJunchao Zhang /* z = A^T x + y */ 3538c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3548c3ff71bSJunchao Zhang { 3558c3ff71bSJunchao Zhang PetscErrorCode ierr; 3568c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 357152b3e56SJunchao Zhang const char *mode; 358152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 359152b3e56SJunchao Zhang PetscScalarKokkosView zv; 360076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3618c3ff71bSJunchao Zhang 3628c3ff71bSJunchao Zhang PetscFunctionBegin; 363eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3648c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 3688c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 369152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 370076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371152b3e56SJunchao Zhang mode = "N"; 372152b3e56SJunchao Zhang } else { 373076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 375152b3e56SJunchao Zhang mode = "T"; 376152b3e56SJunchao Zhang } 377076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 381076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3838c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3848c3ff71bSJunchao Zhang } 3858c3ff71bSJunchao Zhang 3868c3ff71bSJunchao Zhang /* z = A^H x + y */ 3878c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3888c3ff71bSJunchao Zhang { 3898c3ff71bSJunchao Zhang PetscErrorCode ierr; 3908c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 391152b3e56SJunchao Zhang const char *mode; 392152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 393152b3e56SJunchao Zhang PetscScalarKokkosView zv; 394076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3958c3ff71bSJunchao Zhang 3968c3ff71bSJunchao Zhang PetscFunctionBegin; 397eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3988c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 4028c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 403152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 404076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405152b3e56SJunchao Zhang mode = "N"; 406152b3e56SJunchao Zhang } else { 407076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 409152b3e56SJunchao Zhang mode = "C"; 410152b3e56SJunchao Zhang } 411076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 415076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417152b3e56SJunchao Zhang PetscFunctionReturn(0); 418152b3e56SJunchao Zhang } 419152b3e56SJunchao Zhang 420152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421152b3e56SJunchao Zhang { 422152b3e56SJunchao Zhang PetscErrorCode ierr; 423152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424152b3e56SJunchao Zhang 425152b3e56SJunchao Zhang PetscFunctionBegin; 426152b3e56SJunchao Zhang switch (op) { 427152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 428152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 431152b3e56SJunchao Zhang break; 432152b3e56SJunchao Zhang default: 433152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434152b3e56SJunchao Zhang break; 435152b3e56SJunchao Zhang } 4368c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4378c3ff71bSJunchao Zhang } 4388c3ff71bSJunchao Zhang 439076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4403d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4418c3ff71bSJunchao Zhang { 4428c3ff71bSJunchao Zhang PetscErrorCode ierr; 443076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4448c3ff71bSJunchao Zhang 4458c3ff71bSJunchao Zhang PetscFunctionBegin; 446a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4498c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 4522c71b3e2SJacob Faibussowitsch PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 458394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 4592c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4618c3ff71bSJunchao Zhang } 462076ba34aSJunchao Zhang } 4638c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4648c3ff71bSJunchao Zhang } 4658c3ff71bSJunchao Zhang 466076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468076ba34aSJunchao Zhang */ 469076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4708c3ff71bSJunchao Zhang { 4718c3ff71bSJunchao Zhang PetscErrorCode ierr; 472076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 473076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474076ba34aSJunchao Zhang Mat mat; 4758c3ff71bSJunchao Zhang 4768c3ff71bSJunchao Zhang PetscFunctionBegin; 477076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479076ba34aSJunchao Zhang mat = *B; 480076ba34aSJunchao Zhang if (A->assembled) { 481076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484076ba34aSJunchao Zhang /* Now copy values to B if needed */ 485076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 486076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 487076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 489076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 490076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 492076ba34aSJunchao Zhang } 493076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 496076ba34aSJunchao Zhang } 497076ba34aSJunchao Zhang mat->spptr = bkok; 498076ba34aSJunchao Zhang } 499076ba34aSJunchao Zhang 500076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 5048c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5058c3ff71bSJunchao Zhang } 5068c3ff71bSJunchao Zhang 5070ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 5080ecb592aSJunchao Zhang { 5090ecb592aSJunchao Zhang PetscErrorCode ierr; 5100ecb592aSJunchao Zhang Mat At; 511ff751488SJunchao Zhang KokkosCsrMatrix *internT; 5120ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 5130ecb592aSJunchao Zhang 5140ecb592aSJunchao Zhang PetscFunctionBegin; 5150ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 5160ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 517ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 518ff751488SJunchao Zhang CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 5190ecb592aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 5200ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5217a3b1c56SStefano Zampini else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 5220ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5230ecb592aSJunchao Zhang if ((*B)->assembled) { 5240ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5250ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 5260ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 5270ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5280ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5290ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5300ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5310ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 5320ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5330ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5340ecb592aSJunchao Zhang } 5350ecb592aSJunchao Zhang PetscFunctionReturn(0); 5360ecb592aSJunchao Zhang } 5370ecb592aSJunchao Zhang 5388c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5398c3ff71bSJunchao Zhang { 5408c3ff71bSJunchao Zhang PetscErrorCode ierr; 54186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5428c3ff71bSJunchao Zhang 5438c3ff71bSJunchao Zhang PetscFunctionBegin; 54486a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 54586a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5468c3ff71bSJunchao Zhang delete aijkok; 54786a27549SJunchao Zhang } else { 54886a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 54986a27549SJunchao Zhang } 550152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 55142550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 55242550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 553152b3e56SJunchao Zhang A->spptr = NULL; 5548c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5558c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5568c3ff71bSJunchao Zhang } 5578c3ff71bSJunchao Zhang 558*3f3ba80aSJunchao Zhang /*MC 559*3f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 560*3f3ba80aSJunchao Zhang 561*3f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 562*3f3ba80aSJunchao Zhang 563*3f3ba80aSJunchao Zhang Options Database Keys: 564*3f3ba80aSJunchao Zhang . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 565*3f3ba80aSJunchao Zhang 566*3f3ba80aSJunchao Zhang Level: beginner 567*3f3ba80aSJunchao Zhang 568*3f3ba80aSJunchao Zhang .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 569*3f3ba80aSJunchao Zhang M*/ 57086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 57186a27549SJunchao Zhang { 57286a27549SJunchao Zhang PetscErrorCode ierr; 57386a27549SJunchao Zhang 57486a27549SJunchao Zhang PetscFunctionBegin; 57586a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 57686a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 57786a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 57886a27549SJunchao Zhang PetscFunctionReturn(0); 57986a27549SJunchao Zhang } 58086a27549SJunchao Zhang 581076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 582076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 583a3f881fbSStefano Zampini { 584076ba34aSJunchao Zhang PetscErrorCode ierr; 585076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 586076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 587076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 588076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 589076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 590076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 591a3f881fbSStefano Zampini 592a3f881fbSStefano Zampini PetscFunctionBegin; 593076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 594076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 595076ba34aSJunchao Zhang PetscValidPointer(C,4); 596076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 597076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 5982c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 5992c71b3e2SJacob Faibussowitsch PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 600076ba34aSJunchao Zhang 601076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 602076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 603076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 604076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 605076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 606076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 607076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 608076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 609076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 610076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 611076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 612076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 613076ba34aSJunchao Zhang nnz = a->nz + b->nz; 614076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 615076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 616076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 617076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 618076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 619076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 620076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 621076ba34aSJunchao Zhang ca = ca_dual.view_device(); 622076ba34aSJunchao Zhang ci = ci_dual.view_device(); 623076ba34aSJunchao Zhang cj = cj_dual.view_device(); 624076ba34aSJunchao Zhang 625076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 626076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 627076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 628076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 629076ba34aSJunchao Zhang 630076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 631076ba34aSJunchao Zhang ci(i) = coffset; 632076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 633076ba34aSJunchao Zhang }); 634076ba34aSJunchao Zhang 635076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 636076ba34aSJunchao Zhang if (k < alen) { 637076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 638076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 639076ba34aSJunchao Zhang } else { 640076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 641076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 642076ba34aSJunchao Zhang } 643076ba34aSJunchao Zhang }); 644076ba34aSJunchao Zhang }); 645076ba34aSJunchao Zhang ca_dual.modify_device(); 646076ba34aSJunchao Zhang ci_dual.modify_device(); 647076ba34aSJunchao Zhang cj_dual.modify_device(); 648076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 649076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 650076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 651076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 652076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 653076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 654076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 655076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 656076ba34aSJunchao Zhang 657076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 658076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 659076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 660076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 661076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 662076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 663076ba34aSJunchao Zhang }); 664076ba34aSJunchao Zhang }); 665076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 666076ba34aSJunchao Zhang } 667076ba34aSJunchao Zhang PetscFunctionReturn(0); 668076ba34aSJunchao Zhang } 669076ba34aSJunchao Zhang 670076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 671076ba34aSJunchao Zhang { 672076ba34aSJunchao Zhang PetscFunctionBegin; 673076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 674a3f881fbSStefano Zampini PetscFunctionReturn(0); 675a3f881fbSStefano Zampini } 676a3f881fbSStefano Zampini 677a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 678a3f881fbSStefano Zampini { 679076ba34aSJunchao Zhang PetscErrorCode ierr; 680a3f881fbSStefano Zampini Mat_Product *product = C->product; 681a3f881fbSStefano Zampini Mat A,B; 682076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 683a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 684a3f881fbSStefano Zampini Mat_SeqAIJ *c; 685076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 686076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 687a3f881fbSStefano Zampini 688a3f881fbSStefano Zampini PetscFunctionBegin; 689a3f881fbSStefano Zampini MatCheckProduct(C,1); 6902c71b3e2SJacob Faibussowitsch PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 691076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 692076ba34aSJunchao Zhang 693076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 694076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 695076ba34aSJunchao Zhang PetscFunctionReturn(0); 696076ba34aSJunchao Zhang } 697076ba34aSJunchao Zhang 698076ba34aSJunchao Zhang switch (product->type) { 699076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 700076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 701076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 702076ba34aSJunchao Zhang default: 70398921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 704076ba34aSJunchao Zhang } 705076ba34aSJunchao Zhang 706a3f881fbSStefano Zampini A = product->A; 707a3f881fbSStefano Zampini B = product->B; 708a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 709a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 710a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 711a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 712a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 713076ba34aSJunchao Zhang 7142c71b3e2SJacob Faibussowitsch PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 715076ba34aSJunchao Zhang 716076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 717076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 718076ba34aSJunchao Zhang 719076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 720076ba34aSJunchao Zhang if (transA) { 721076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 722076ba34aSJunchao Zhang transA = false; 723a3f881fbSStefano Zampini } 724a3f881fbSStefano Zampini 725076ba34aSJunchao Zhang if (transB) { 726076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 727076ba34aSJunchao Zhang transB = false; 728076ba34aSJunchao Zhang } 729eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 730076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 731076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 732eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 733076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 734a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 735a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 7367d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 737a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 7387d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 739a3f881fbSStefano Zampini c->reallocs = 0; 740076ba34aSJunchao Zhang C->info.mallocs = 0; 741a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 742a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 743a3f881fbSStefano Zampini C->num_ass++; 744a3f881fbSStefano Zampini PetscFunctionReturn(0); 745a3f881fbSStefano Zampini } 746a3f881fbSStefano Zampini 747a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 748a3f881fbSStefano Zampini { 749a3f881fbSStefano Zampini PetscErrorCode ierr; 750076ba34aSJunchao Zhang Mat_Product *product = C->product; 751076ba34aSJunchao Zhang MatProductType ptype; 752076ba34aSJunchao Zhang Mat A,B; 753076ba34aSJunchao Zhang bool transA,transB; 754076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 755076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 756076ba34aSJunchao Zhang MPI_Comm comm; 757076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 758a3f881fbSStefano Zampini 759a3f881fbSStefano Zampini PetscFunctionBegin; 760a3f881fbSStefano Zampini MatCheckProduct(C,1); 761076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 7622c71b3e2SJacob Faibussowitsch PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 763a3f881fbSStefano Zampini A = product->A; 764a3f881fbSStefano Zampini B = product->B; 765a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 766a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 767a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 768a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 769076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 770076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 771076ba34aSJunchao Zhang 772a3f881fbSStefano Zampini ptype = product->type; 773a3f881fbSStefano Zampini switch (ptype) { 774076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 775076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 776076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 777a3f881fbSStefano Zampini default: 77898921bdaSJacob Faibussowitsch SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 779a3f881fbSStefano Zampini } 780a3f881fbSStefano Zampini 781076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 782076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 783076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 784076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 785a3f881fbSStefano Zampini 786076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 787076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7886ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7896ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7906ffb9508SJunchao Zhang */ 791076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 792076ba34aSJunchao Zhang 793eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 794076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 795076ba34aSJunchao Zhang if (transA) { 796076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 797076ba34aSJunchao Zhang transA = false; 798076ba34aSJunchao Zhang } 799076ba34aSJunchao Zhang 800076ba34aSJunchao Zhang if (transB) { 801076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 802076ba34aSJunchao Zhang transB = false; 803076ba34aSJunchao Zhang } 804076ba34aSJunchao Zhang 805076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 806076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 807076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 808076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 809076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 810076ba34aSJunchao Zhang */ 811076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 812076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 813eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 814076ba34aSJunchao Zhang 815076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 816076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 817076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 818a3f881fbSStefano Zampini PetscFunctionReturn(0); 819a3f881fbSStefano Zampini } 820a3f881fbSStefano Zampini 821a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 822076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 823a3f881fbSStefano Zampini { 824a3f881fbSStefano Zampini PetscErrorCode ierr; 825076ba34aSJunchao Zhang Mat_Product *product = mat->product; 826a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 827a3f881fbSStefano Zampini 828a3f881fbSStefano Zampini PetscFunctionBegin; 829a3f881fbSStefano Zampini MatCheckProduct(mat,1); 830a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 831a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 832a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 833a3f881fbSStefano Zampini } 834a3f881fbSStefano Zampini if (Biskok && Ciskok) { 835a3f881fbSStefano Zampini switch (product->type) { 836a3f881fbSStefano Zampini case MATPRODUCT_AB: 837a3f881fbSStefano Zampini case MATPRODUCT_AtB: 838a3f881fbSStefano Zampini case MATPRODUCT_ABt: 839a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 840a3f881fbSStefano Zampini break; 841a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 842a3f881fbSStefano Zampini case MATPRODUCT_RARt: 843a3f881fbSStefano Zampini case MATPRODUCT_ABC: 844a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 845a3f881fbSStefano Zampini break; 846a3f881fbSStefano Zampini default: 847a3f881fbSStefano Zampini break; 848a3f881fbSStefano Zampini } 849a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 850a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 851a3f881fbSStefano Zampini } 852a3f881fbSStefano Zampini PetscFunctionReturn(0); 853a3f881fbSStefano Zampini } 854a587d139SMark 855f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 856f0cf5187SStefano Zampini { 857f0cf5187SStefano Zampini PetscErrorCode ierr; 858f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 859f0cf5187SStefano Zampini 860f0cf5187SStefano Zampini PetscFunctionBegin; 861eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 862f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 863f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 864076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 865076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 866076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 8676af1d01cSJed Brown ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 868f0cf5187SStefano Zampini PetscFunctionReturn(0); 869f0cf5187SStefano Zampini } 870f0cf5187SStefano Zampini 871a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 872a587d139SMark { 873a587d139SMark PetscErrorCode ierr; 874076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 875a587d139SMark 876a587d139SMark PetscFunctionBegin; 877076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8782328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 879076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 880076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 8812328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8822328674fSJunchao Zhang ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 8832328674fSJunchao Zhang } 884a587d139SMark PetscFunctionReturn(0); 885a587d139SMark } 886a587d139SMark 887db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 88842550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 889db78de30SJunchao Zhang { 890db78de30SJunchao Zhang PetscErrorCode ierr; 891db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 892db78de30SJunchao Zhang 893db78de30SJunchao Zhang PetscFunctionBegin; 894db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 895db78de30SJunchao Zhang PetscValidPointer(kv,2); 896db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 897db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 898db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 899076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 900db78de30SJunchao Zhang PetscFunctionReturn(0); 901db78de30SJunchao Zhang } 902db78de30SJunchao Zhang 90342550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 904db78de30SJunchao Zhang { 905db78de30SJunchao Zhang PetscFunctionBegin; 906db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 907db78de30SJunchao Zhang PetscValidPointer(kv,2); 908db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 909db78de30SJunchao Zhang PetscFunctionReturn(0); 910db78de30SJunchao Zhang } 911db78de30SJunchao Zhang 91242550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 913db78de30SJunchao Zhang { 914db78de30SJunchao Zhang PetscErrorCode ierr; 915db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 916db78de30SJunchao Zhang 917db78de30SJunchao Zhang PetscFunctionBegin; 918db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 919db78de30SJunchao Zhang PetscValidPointer(kv,2); 920db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 921db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 922db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 923076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 924db78de30SJunchao Zhang PetscFunctionReturn(0); 925db78de30SJunchao Zhang } 926db78de30SJunchao Zhang 92742550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 928db78de30SJunchao Zhang { 929db78de30SJunchao Zhang PetscErrorCode ierr; 930db78de30SJunchao Zhang 931db78de30SJunchao Zhang PetscFunctionBegin; 932db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 933db78de30SJunchao Zhang PetscValidPointer(kv,2); 934db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 935076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 936db78de30SJunchao Zhang PetscFunctionReturn(0); 937db78de30SJunchao Zhang } 938db78de30SJunchao Zhang 93942550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 940db78de30SJunchao Zhang { 941db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 942db78de30SJunchao Zhang 943db78de30SJunchao Zhang PetscFunctionBegin; 944db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 945db78de30SJunchao Zhang PetscValidPointer(kv,2); 946db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 947db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 948076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 949db78de30SJunchao Zhang PetscFunctionReturn(0); 950db78de30SJunchao Zhang } 951db78de30SJunchao Zhang 95242550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 953db78de30SJunchao Zhang { 954db78de30SJunchao Zhang PetscErrorCode ierr; 955db78de30SJunchao Zhang 956db78de30SJunchao Zhang PetscFunctionBegin; 957db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 958db78de30SJunchao Zhang PetscValidPointer(kv,2); 959db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 960076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 961db78de30SJunchao Zhang PetscFunctionReturn(0); 962db78de30SJunchao Zhang } 963db78de30SJunchao Zhang 964c17cf699SJunchao Zhang /* Computes Y += alpha X */ 965c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 966a587d139SMark { 967a587d139SMark PetscErrorCode ierr; 968a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 969c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 970c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 971c17cf699SJunchao Zhang MatScalarKokkosView Ya; 972a587d139SMark 973a587d139SMark PetscFunctionBegin; 974c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 975c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 976c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 977c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 978eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 979db78de30SJunchao Zhang 980c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 981c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 982a587d139SMark PetscBool e; 983a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 984a587d139SMark if (e) { 985a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 986c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 987a587d139SMark } 988a587d139SMark } 989db78de30SJunchao Zhang 990c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 991c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 992c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 993c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 994c17cf699SJunchao Zhang */ 995c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 996c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 997c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 998c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 999c17cf699SJunchao Zhang 1000c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1001c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 1002c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 1003c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1004c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1005c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1006c17cf699SJunchao Zhang 1007c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1008c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 1009c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1010c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 1011c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1012c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1013c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1014c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1015c17cf699SJunchao Zhang q++; 1016a587d139SMark } else { 1017c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1018c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1019c17cf699SJunchao Zhang */ 1020c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1021a587d139SMark } 1022c17cf699SJunchao Zhang } 1023c17cf699SJunchao Zhang }); 1024c17cf699SJunchao Zhang }); 1025c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 1026c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1027c17cf699SJunchao Zhang Mat Z; 1028c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1029c17cf699SJunchao Zhang KernelHandle kh; 1030c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1031c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1032c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1033c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 1034c17cf699SJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1035c17cf699SJunchao Zhang ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1036c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1037c17cf699SJunchao Zhang } 1038eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1039eeadb341SJunchao Zhang ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1040a587d139SMark PetscFunctionReturn(0); 1041a587d139SMark } 1042a587d139SMark 1043394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 104442550becSJunchao Zhang { 104542550becSJunchao Zhang PetscErrorCode ierr; 1046394ed5ebSJunchao Zhang Mat newmat; 104742550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 104842550becSJunchao Zhang Mat_SeqAIJ *aseq; 104942550becSJunchao Zhang 105042550becSJunchao Zhang PetscFunctionBegin; 1051394ed5ebSJunchao Zhang ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 1052394ed5ebSJunchao Zhang ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 1053394ed5ebSJunchao Zhang ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr); 1054394ed5ebSJunchao Zhang ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 105542550becSJunchao Zhang ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 105642550becSJunchao Zhang ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 105742550becSJunchao Zhang ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1058394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(mat->data); 105942550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1060394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 106142550becSJunchao Zhang PetscFunctionReturn(0); 106242550becSJunchao Zhang } 106342550becSJunchao Zhang 106442550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 106542550becSJunchao Zhang { 106642550becSJunchao Zhang PetscErrorCode ierr; 106742550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 106842550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1069394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1070394ed5ebSJunchao Zhang const PetscCountKokkosView& jmap = akok->jmap_d; 1071394ed5ebSJunchao Zhang const PetscCountKokkosView& perm = akok->perm_d; 107242550becSJunchao Zhang MatScalarKokkosView Aa; 107342550becSJunchao Zhang ConstMatScalarKokkosView kv; 107442550becSJunchao Zhang PetscMemType memtype; 107542550becSJunchao Zhang 107642550becSJunchao Zhang PetscFunctionBegin; 1077394ed5ebSJunchao Zhang PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()"); 107842550becSJunchao Zhang ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 107942550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1080394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 108142550becSJunchao Zhang } else { 1082394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 108342550becSJunchao Zhang } 108442550becSJunchao Zhang 1085394ed5ebSJunchao Zhang if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */ 1086394ed5ebSJunchao Zhang else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */ 108742550becSJunchao Zhang 1088394ed5ebSJunchao Zhang Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1089b6c38306SJunchao Zhang PetscScalar sum = 0.0; 1090b6c38306SJunchao Zhang for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1091b6c38306SJunchao Zhang Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 109242550becSJunchao Zhang }); 1093394ed5ebSJunchao Zhang 1094394ed5ebSJunchao Zhang if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} 1095394ed5ebSJunchao Zhang else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);} 109642550becSJunchao Zhang PetscFunctionReturn(0); 109742550becSJunchao Zhang } 109842550becSJunchao Zhang 109986a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 11008f7e8f9dSMark Adams { 11018f7e8f9dSMark Adams PetscErrorCode ierr; 11028f7e8f9dSMark Adams 11038f7e8f9dSMark Adams PetscFunctionBegin; 11048f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 11058f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 11068f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 11078f7e8f9dSMark Adams PetscFunctionReturn(0); 11088f7e8f9dSMark Adams } 11098f7e8f9dSMark Adams 11108c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 11118c3ff71bSJunchao Zhang { 111242550becSJunchao Zhang PetscErrorCode ierr; 1113076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1114076ba34aSJunchao Zhang 11158c3ff71bSJunchao Zhang PetscFunctionBegin; 1116076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 11176f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 11186f3d89d0SStefano Zampini 11198c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 11208c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 11218c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1122a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1123f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1124a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1125076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 11268c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 11278c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 11288c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 11298c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 11308c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 11318c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1132076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 11330ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1134152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1135076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1136076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1137076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1138076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1139076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1140076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 114142550becSJunchao Zhang 114242550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 114342550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1144076ba34aSJunchao Zhang PetscFunctionReturn(0); 1145076ba34aSJunchao Zhang } 1146076ba34aSJunchao Zhang 1147076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1148076ba34aSJunchao Zhang { 1149076ba34aSJunchao Zhang PetscErrorCode ierr; 1150076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1151076ba34aSJunchao Zhang PetscInt i,m,n; 1152076ba34aSJunchao Zhang 1153076ba34aSJunchao Zhang PetscFunctionBegin; 11542c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1155076ba34aSJunchao Zhang 1156076ba34aSJunchao Zhang m = akok->nrows(); 1157076ba34aSJunchao Zhang n = akok->ncols(); 1158076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1159076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1160076ba34aSJunchao Zhang 1161076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1162076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1163076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1164076ba34aSJunchao Zhang 1165076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1166076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1167076ba34aSJunchao Zhang 1168076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1169076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1170076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1171076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1172076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1173076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1174076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1175076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1176076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1177076ba34aSJunchao Zhang 1178076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1179076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1180076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1181076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1182076ba34aSJunchao Zhang } 1183076ba34aSJunchao Zhang 1184076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1185076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1186ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1187076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1188076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1189076ba34aSJunchao Zhang PetscFunctionReturn(0); 1190076ba34aSJunchao Zhang } 1191076ba34aSJunchao Zhang 1192076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1193076ba34aSJunchao Zhang 1194076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1195076ba34aSJunchao Zhang */ 1196076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1197076ba34aSJunchao Zhang { 1198076ba34aSJunchao Zhang PetscErrorCode ierr; 1199076ba34aSJunchao Zhang 1200076ba34aSJunchao Zhang PetscFunctionBegin; 1201076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1202076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 12038c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12048c3ff71bSJunchao Zhang } 12058c3ff71bSJunchao Zhang 12068c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1207152b3e56SJunchao Zhang /*@C 12088c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 12098c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 12108c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 12118c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 12128c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 12138c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 12148c3ff71bSJunchao Zhang 12158c3ff71bSJunchao Zhang Collective 12168c3ff71bSJunchao Zhang 12178c3ff71bSJunchao Zhang Input Parameters: 12188c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 12198c3ff71bSJunchao Zhang . m - number of rows 12208c3ff71bSJunchao Zhang . n - number of columns 12218c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 12228c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 12238c3ff71bSJunchao Zhang (possibly different for each row) or NULL 12248c3ff71bSJunchao Zhang 12258c3ff71bSJunchao Zhang Output Parameter: 12268c3ff71bSJunchao Zhang . A - the matrix 12278c3ff71bSJunchao Zhang 12288c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 12298c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 12308c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 12318c3ff71bSJunchao Zhang 12328c3ff71bSJunchao Zhang Notes: 12338c3ff71bSJunchao Zhang If nnz is given then nz is ignored 12348c3ff71bSJunchao Zhang 12358c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 12368c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 12378c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 12388c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 12398c3ff71bSJunchao Zhang 12408c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 12418c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 12428c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 12438c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 12448c3ff71bSJunchao Zhang 12458c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 12468c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 12478c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 12488c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 12498c3ff71bSJunchao Zhang 12508c3ff71bSJunchao Zhang Level: intermediate 12518c3ff71bSJunchao Zhang 1252076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 12538c3ff71bSJunchao Zhang @*/ 12548c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 12558c3ff71bSJunchao Zhang { 12568c3ff71bSJunchao Zhang PetscErrorCode ierr; 12578c3ff71bSJunchao Zhang 12588c3ff71bSJunchao Zhang PetscFunctionBegin; 12598c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 12608c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 12618c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 12628c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 12638c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 12648c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12658c3ff71bSJunchao Zhang } 1266930e68a5SMark Adams 12678f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 12688f7e8f9dSMark Adams // 126946804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 12708f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 12718f7e8f9dSMark Adams // 12728f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1273930e68a5SMark Adams { 12748f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 12758f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 12768f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1277930e68a5SMark Adams PetscErrorCode ierr; 12788f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1279300d22a6SJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1280076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1281076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 12828f7e8f9dSMark Adams PetscBool row_identity,col_identity; 128346804e07SMark Adams PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1284930e68a5SMark Adams 1285930e68a5SMark Adams PetscFunctionBegin; 12862c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 12878f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 12882c71b3e2SJacob Faibussowitsch PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 12898f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 12908f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 12918f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 12928f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 12938f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 12948f7e8f9dSMark Adams { 12958f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 12968f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 12978f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 12988f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 12998f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 13008f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 13018f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 13028f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 13038f7e8f9dSMark Adams size_t flops_h = 0.0; 13048f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 13058f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 13068f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 13078f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 13088f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 13098f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 13108f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 13118f7e8f9dSMark Adams // Fill A --> fact 13128f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1313042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 13148f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 13158f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 13168f7e8f9dSMark Adams // zero rows of B 13178f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 13188f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 13198f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 13208f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 13218f7e8f9dSMark Adams /* zero (unfactored row) */ 13228f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 13238f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 13248f7e8f9dSMark Adams }); 13258f7e8f9dSMark Adams // copy A into B 13268f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 13278f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 13288f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 13298f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 13308f7e8f9dSMark Adams /* load in initial (unfactored row) */ 13318f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 13328f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 13338f7e8f9dSMark Adams PetscScalar vala = av[j]; 13348f7e8f9dSMark Adams if (colb == rowb) { 13358f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 13368f7e8f9dSMark Adams } else { 13378f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13388f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13398f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 13408f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 13418f7e8f9dSMark Adams if (pbj[j] == colb) { 13428f7e8f9dSMark Adams pba[j] = vala; 13438f7e8f9dSMark Adams set++; 13448f7e8f9dSMark Adams break; 13458f7e8f9dSMark Adams } 13468f7e8f9dSMark Adams } 13478f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 13488f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 13498f1da0b2SJunchao Zhang #endif 13508f7e8f9dSMark Adams } 13518f7e8f9dSMark Adams } 13528f7e8f9dSMark Adams }); 13538f7e8f9dSMark Adams }); 13548f7e8f9dSMark Adams Kokkos::fence(); 1355930e68a5SMark Adams 13568f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 13578f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13588f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13598f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1360042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 13618f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 13628f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 13638f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 13648f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 13658f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 13668f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 13678f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 13688f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 13698f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 13708f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 13718f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 13728f7e8f9dSMark Adams else { 13738f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 13748f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 13758f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 13768f7e8f9dSMark Adams size_t st_idx; 13778f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 13788f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 13798f7e8f9dSMark Adams // get column, there has got to be a better way 13808f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 13818f7e8f9dSMark Adams if (pjL[j] == ii) { 13828f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13838f7e8f9dSMark Adams idx = j; // output 13848f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 13858f7e8f9dSMark Adams } 13868f7e8f9dSMark Adams }, st_idx); 13878f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 13888f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 138999551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 139099551766SMark Adams #endif 139199551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 13928f7e8f9dSMark Adams // U(i+1,:end) 13938f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 13948f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 13958f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 13968f7e8f9dSMark Adams if (col==myk) { 13978f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 13988f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 13998f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 14008f7e8f9dSMark Adams } else { 14018f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 14028f7e8f9dSMark Adams PetscInt high, low; 14038f7e8f9dSMark Adams const PetscInt *startj; 14048f7e8f9dSMark Adams if (col<myk) { // L 14058f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 14068f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 14078f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 14088f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 14098f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 14108f7e8f9dSMark Adams } else { 14118f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 14128f7e8f9dSMark Adams start = ba_d + idx; 14138f7e8f9dSMark Adams startj= bj_d + idx; 14148f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 14158f7e8f9dSMark Adams } 14168f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 14178f7e8f9dSMark Adams low = 0; 14188f7e8f9dSMark Adams high = (PetscInt)(end-start); 14198f7e8f9dSMark Adams while (high-low > 5) { 14208f7e8f9dSMark Adams int t = (low+high)/2; 14218f7e8f9dSMark Adams if (startj[t] > col) high = t; 14228f7e8f9dSMark Adams else low = t; 14238f7e8f9dSMark Adams } 14248f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 14258f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 14268f7e8f9dSMark Adams } 14278f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 142899551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 142999551766SMark Adams #endif 14308f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 14318f7e8f9dSMark Adams } 14328f7e8f9dSMark Adams }); 14338f7e8f9dSMark Adams } 14348f7e8f9dSMark Adams }); 14358f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 14368f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 14378f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 14388f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 14398f7e8f9dSMark Adams }); 14408f7e8f9dSMark Adams Kokkos::fence(); 14418f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 14428f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 14438f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 14448f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 14458f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 14468f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 14478f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 14488f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 14498f7e8f9dSMark Adams if (i < end) { 14508f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 14518f7e8f9dSMark Adams *pv = 1.0/(*pv); 14528f7e8f9dSMark Adams } 14538f7e8f9dSMark Adams }); 14548f7e8f9dSMark Adams }); 14558f7e8f9dSMark Adams } 14568f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 14578f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 14588f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 14598f7e8f9dSMark Adams 14608f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 14618f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 14628f7e8f9dSMark Adams if (b->inode.size) { 14638f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 14648f7e8f9dSMark Adams } else if (row_identity && col_identity) { 14658f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 14668f7e8f9dSMark Adams } else { 14678f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 14688f7e8f9dSMark Adams } 14698f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 14708f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 14718f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 14728f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 14738f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 14748f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 14758f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 14768f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 14778f7e8f9dSMark Adams 1478930e68a5SMark Adams PetscFunctionReturn(0); 1479930e68a5SMark Adams } 1480930e68a5SMark Adams 148186a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1482930e68a5SMark Adams { 1483930e68a5SMark Adams PetscErrorCode ierr; 1484930e68a5SMark Adams 1485930e68a5SMark Adams PetscFunctionBegin; 1486930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 148786a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 148886a27549SJunchao Zhang PetscFunctionReturn(0); 148986a27549SJunchao Zhang } 149086a27549SJunchao Zhang 149186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 149286a27549SJunchao Zhang { 149386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 149486a27549SJunchao Zhang 149586a27549SJunchao Zhang PetscFunctionBegin; 149686a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 149786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 149886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 149986a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 150086a27549SJunchao Zhang } 150186a27549SJunchao Zhang PetscFunctionReturn(0); 150286a27549SJunchao Zhang } 150386a27549SJunchao Zhang 150486a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 150586a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 150686a27549SJunchao Zhang { 150786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1508076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 150986a27549SJunchao Zhang 151086a27549SJunchao Zhang PetscFunctionBegin; 151186a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 151286a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1513076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 151486a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1515076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1516076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 151786a27549SJunchao Zhang 151886a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1519076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1520076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1521076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 152286a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 152386a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 152486a27549SJunchao Zhang 152586a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 152686a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 152786a27549SJunchao Zhang */ 1528076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1529076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 153086a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 153186a27549SJunchao Zhang 153286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 153386a27549SJunchao Zhang 153486a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1535076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 153686a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1537076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1538076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 153986a27549SJunchao Zhang 154086a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1541076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1542076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1543076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 154486a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 154586a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 154686a27549SJunchao Zhang 154786a27549SJunchao Zhang /* Sort indices. See comments above */ 1548076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1549076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 155086a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 155186a27549SJunchao Zhang 155286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 155386a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 155486a27549SJunchao Zhang } 155586a27549SJunchao Zhang PetscFunctionReturn(0); 155686a27549SJunchao Zhang } 155786a27549SJunchao Zhang 155886a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 155986a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 156086a27549SJunchao Zhang { 156186a27549SJunchao Zhang PetscErrorCode ierr; 156286a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 156386a27549SJunchao Zhang PetscScalarKokkosView xv; 156486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 156586a27549SJunchao Zhang 156686a27549SJunchao Zhang PetscFunctionBegin; 1567eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 156886a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 156986a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1570ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 157186a27549SJunchao Zhang /* Solve L tmpv = b */ 15723f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 157386a27549SJunchao Zhang /* Solve Ux = tmpv */ 15743f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 157586a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1576ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1577eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 157886a27549SJunchao Zhang PetscFunctionReturn(0); 157986a27549SJunchao Zhang } 158086a27549SJunchao Zhang 1581076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 158286a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 158386a27549SJunchao Zhang { 158486a27549SJunchao Zhang PetscErrorCode ierr; 158586a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 158686a27549SJunchao Zhang PetscScalarKokkosView xv; 158786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 158886a27549SJunchao Zhang 158986a27549SJunchao Zhang PetscFunctionBegin; 1590eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 159186a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 159286a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1593ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 159486a27549SJunchao Zhang /* Solve U^T tmpv = b */ 159586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 159686a27549SJunchao Zhang 159786a27549SJunchao Zhang /* Solve L^T x = tmpv */ 159886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 159986a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1600ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1601eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 160286a27549SJunchao Zhang PetscFunctionReturn(0); 160386a27549SJunchao Zhang } 160486a27549SJunchao Zhang 160586a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 160686a27549SJunchao Zhang { 160786a27549SJunchao Zhang PetscErrorCode ierr; 160886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 160986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 161086a27549SJunchao Zhang PetscInt fill_lev = info->levels; 161186a27549SJunchao Zhang 161286a27549SJunchao Zhang PetscFunctionBegin; 1613eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 161486a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1615076ba34aSJunchao Zhang 1616076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1617076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1618076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1619076ba34aSJunchao Zhang 1620076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 162186a27549SJunchao Zhang 162286a27549SJunchao Zhang B->assembled = PETSC_TRUE; 162386a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 162486a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 162586a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 162686a27549SJunchao Zhang B->ops->matsolve = NULL; 162786a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 162886a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 162986a27549SJunchao Zhang 163086a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 163186a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 163286a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1633eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 1634eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 163586a27549SJunchao Zhang PetscFunctionReturn(0); 163686a27549SJunchao Zhang } 163786a27549SJunchao Zhang 163886a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 163986a27549SJunchao Zhang { 164086a27549SJunchao Zhang PetscErrorCode ierr; 164186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 164286a27549SJunchao Zhang Mat_SeqAIJ *b; 164386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 164486a27549SJunchao Zhang PetscInt fill_lev = info->levels; 164586a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 164686a27549SJunchao Zhang PetscInt n = A->rmap->n; 164786a27549SJunchao Zhang 164886a27549SJunchao Zhang PetscFunctionBegin; 164986a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 165086a27549SJunchao Zhang /* Rebuild factors */ 165186a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 165286a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 165386a27549SJunchao Zhang 165486a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 165586a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 165686a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 165786a27549SJunchao Zhang 165886a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 165986a27549SJunchao Zhang 166086a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 166186a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 166286a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 166386a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 166486a27549SJunchao Zhang 166586a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1666076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1667076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1668076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 166986a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 167086a27549SJunchao Zhang 167186a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 167286a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 167386a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 167486a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 167586a27549SJunchao Zhang 167686a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 167786a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 167886a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 167986a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 168086a27549SJunchao Zhang #else 168186a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 168286a27549SJunchao Zhang #endif 168386a27549SJunchao Zhang 168486a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 168586a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 168686a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 168786a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 168886a27549SJunchao Zhang 168986a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 169086a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 169186a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 169286a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 169386a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 169486a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 169586a27549SJunchao Zhang 169686a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 169786a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1698930e68a5SMark Adams PetscFunctionReturn(0); 1699930e68a5SMark Adams } 1700930e68a5SMark Adams 17018f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 17028f7e8f9dSMark Adams { 17038f7e8f9dSMark Adams PetscErrorCode ierr; 17048f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 17058f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1706930e68a5SMark Adams 17078f7e8f9dSMark Adams PetscFunctionBegin; 17088f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 17098f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 17108f7e8f9dSMark Adams // move B data into Kokkos 17118f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 17128f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 17138f7e8f9dSMark Adams { 17148f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1715300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 17168f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1717300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1718300d22a6SJunchao Zhang Kokkos::deep_copy (baijkok->diag_d, h_diag); 17198f7e8f9dSMark Adams } 17208f7e8f9dSMark Adams } 17218f7e8f9dSMark Adams PetscFunctionReturn(0); 17228f7e8f9dSMark Adams } 17238f7e8f9dSMark Adams 172486a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1725930e68a5SMark Adams { 1726930e68a5SMark Adams PetscFunctionBegin; 1727930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1728930e68a5SMark Adams PetscFunctionReturn(0); 1729930e68a5SMark Adams } 1730930e68a5SMark Adams 17318f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 17328f7e8f9dSMark Adams { 17338f7e8f9dSMark Adams PetscFunctionBegin; 17348f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 17358f7e8f9dSMark Adams PetscFunctionReturn(0); 17368f7e8f9dSMark Adams } 17378f7e8f9dSMark Adams 1738930e68a5SMark Adams /*MC 173986a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 174086a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1741930e68a5SMark Adams 1742930e68a5SMark Adams Level: beginner 1743930e68a5SMark Adams 1744*3f3ba80aSJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1745930e68a5SMark Adams M*/ 174686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1747930e68a5SMark Adams { 1748930e68a5SMark Adams PetscErrorCode ierr; 1749930e68a5SMark Adams PetscInt n = A->rmap->n; 1750930e68a5SMark Adams 1751930e68a5SMark Adams PetscFunctionBegin; 1752930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1753930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1754930e68a5SMark Adams (*B)->factortype = ftype; 17554ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1756930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1757930e68a5SMark Adams 17588f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1759930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 176086a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 176186a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 176286a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 176386a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 176486a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 176586a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 176698921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1767930e68a5SMark Adams 1768930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 176986a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1770930e68a5SMark Adams PetscFunctionReturn(0); 1771930e68a5SMark Adams } 17728f7e8f9dSMark Adams 17738f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 17748f7e8f9dSMark Adams { 17758f7e8f9dSMark Adams PetscErrorCode ierr; 17768f7e8f9dSMark Adams PetscInt n = A->rmap->n; 17778f7e8f9dSMark Adams 17788f7e8f9dSMark Adams PetscFunctionBegin; 17798f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 17808f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 17818f7e8f9dSMark Adams (*B)->factortype = ftype; 1782f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 17834ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 17848f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 17858f7e8f9dSMark Adams 17868f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17878f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 17888f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17898f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 17908f7e8f9dSMark Adams 17918f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 17928f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 17938f7e8f9dSMark Adams PetscFunctionReturn(0); 17948f7e8f9dSMark Adams } 179586a27549SJunchao Zhang 179686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 179786a27549SJunchao Zhang { 179886a27549SJunchao Zhang PetscErrorCode ierr; 179986a27549SJunchao Zhang 180086a27549SJunchao Zhang PetscFunctionBegin; 180186a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 180286a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 180386a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 180486a27549SJunchao Zhang PetscFunctionReturn(0); 180586a27549SJunchao Zhang } 180686a27549SJunchao Zhang 1807076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1808076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1809076ba34aSJunchao Zhang { 1810076ba34aSJunchao Zhang PetscErrorCode ierr; 1811076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1812076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1813076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1814076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1815076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1816076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1817076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1818076ba34aSJunchao Zhang 1819076ba34aSJunchao Zhang PetscFunctionBegin; 1820c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1821076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1822c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1823076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1824c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1825076ba34aSJunchao Zhang } 1826076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1827076ba34aSJunchao Zhang } 1828076ba34aSJunchao Zhang PetscFunctionReturn(0); 1829076ba34aSJunchao Zhang } 1830