111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1786a27549SJunchao Zhang 1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 288c3ff71bSJunchao Zhang PetscErrorCode ierr; 29076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 30076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang PetscFunctionBegin; 33076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 348c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35076ba34aSJunchao Zhang 36076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38076ba34aSJunchao Zhang 39076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 40076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 42076ba34aSJunchao Zhang */ 43076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44076ba34aSJunchao Zhang delete aijkok; 45076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46076ba34aSJunchao Zhang A->spptr = aijkok; 47076ba34aSJunchao Zhang } 48076ba34aSJunchao Zhang 49a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 50a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51a587d139SMark } 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 578c3ff71bSJunchao Zhang { 588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598c3ff71bSJunchao Zhang 608c3ff71bSJunchao Zhang PetscFunctionBegin; 612c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 622c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 63076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 64076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 65580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 6686a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 678c3ff71bSJunchao Zhang } 688c3ff71bSJunchao Zhang PetscFunctionReturn(0); 698c3ff71bSJunchao Zhang } 708c3ff71bSJunchao Zhang 71076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 72fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7386a27549SJunchao Zhang { 7486a27549SJunchao Zhang PetscErrorCode ierr; 7586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7686a27549SJunchao Zhang 7786a27549SJunchao Zhang PetscFunctionBegin; 782c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 7986a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8086a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 832328674fSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 8486a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8586a27549SJunchao Zhang PetscFunctionReturn(0); 8686a27549SJunchao Zhang } 8786a27549SJunchao Zhang 88f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89f0cf5187SStefano Zampini { 90f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91f0cf5187SStefano Zampini 92f0cf5187SStefano Zampini PetscFunctionBegin; 93f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9486a27549SJunchao Zhang /* We do not expect one needs factors on host */ 952c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 962c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 98f0cf5187SStefano Zampini PetscFunctionReturn(0); 99f0cf5187SStefano Zampini } 100f0cf5187SStefano Zampini 101f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102f0cf5187SStefano Zampini { 103076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104f0cf5187SStefano Zampini 105f0cf5187SStefano Zampini PetscFunctionBegin; 106076ba34aSJunchao Zhang if (aijkok) { 107076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 108076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 109076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 110076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111076ba34aSJunchao Zhang } 112076ba34aSJunchao Zhang PetscFunctionReturn(0); 113076ba34aSJunchao Zhang } 114076ba34aSJunchao Zhang 115076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116076ba34aSJunchao Zhang { 117076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118076ba34aSJunchao Zhang 119076ba34aSJunchao Zhang PetscFunctionBegin; 120076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 121076ba34aSJunchao Zhang PetscFunctionReturn(0); 122076ba34aSJunchao Zhang } 123076ba34aSJunchao Zhang 124076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125076ba34aSJunchao Zhang { 126076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127076ba34aSJunchao Zhang 128076ba34aSJunchao Zhang PetscFunctionBegin; 1292328674fSJunchao Zhang if (aijkok) { 130076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 131076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1322328674fSJunchao Zhang } else { 1332328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1342328674fSJunchao Zhang } 135076ba34aSJunchao Zhang PetscFunctionReturn(0); 136076ba34aSJunchao Zhang } 137076ba34aSJunchao Zhang 138076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 139076ba34aSJunchao Zhang { 140076ba34aSJunchao Zhang PetscFunctionBegin; 141076ba34aSJunchao Zhang *array = NULL; 142076ba34aSJunchao Zhang PetscFunctionReturn(0); 143076ba34aSJunchao Zhang } 144076ba34aSJunchao Zhang 145076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 146076ba34aSJunchao Zhang { 147076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 148076ba34aSJunchao Zhang 149076ba34aSJunchao Zhang PetscFunctionBegin; 1502328674fSJunchao Zhang if (aijkok) { 151076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1522328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1532328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1542328674fSJunchao Zhang } 155076ba34aSJunchao Zhang PetscFunctionReturn(0); 156076ba34aSJunchao Zhang } 157076ba34aSJunchao Zhang 158076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 159076ba34aSJunchao Zhang { 160076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 161076ba34aSJunchao Zhang 162076ba34aSJunchao Zhang PetscFunctionBegin; 1632328674fSJunchao Zhang if (aijkok) { 164076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 165076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1662328674fSJunchao Zhang } 167f0cf5187SStefano Zampini PetscFunctionReturn(0); 168f0cf5187SStefano Zampini } 169f0cf5187SStefano Zampini 170a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 171042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 172a587d139SMark { 173042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 174042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 175a587d139SMark 176a587d139SMark PetscFunctionBegin; 1772c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 178152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 179a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 180a587d139SMark PetscFunctionReturn(0); 181a587d139SMark } 182a587d139SMark 183a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 184042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 185a587d139SMark { 186042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 187a587d139SMark 188a587d139SMark PetscFunctionBegin; 189a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 190a587d139SMark *d_mat = aijkok->device_mat_d.data(); 191a587d139SMark } else { 192a587d139SMark PetscErrorCode ierr; 193a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 194a587d139SMark *d_mat = NULL; 195a587d139SMark } 196a587d139SMark PetscFunctionReturn(0); 197a587d139SMark } 198076ba34aSJunchao Zhang 199076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 200076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 201152b3e56SJunchao Zhang { 202152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 203152b3e56SJunchao Zhang 204152b3e56SJunchao Zhang PetscFunctionBegin; 2052c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 206076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 207076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 208076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 209076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 210076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 21186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 212076ba34aSJunchao Zhang } 213076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 214152b3e56SJunchao Zhang PetscFunctionReturn(0); 215152b3e56SJunchao Zhang } 216152b3e56SJunchao Zhang 217076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 218076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 219152b3e56SJunchao Zhang { 220eeadb341SJunchao Zhang PetscErrorCode ierr; 221152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222152b3e56SJunchao Zhang 223152b3e56SJunchao Zhang PetscFunctionBegin; 224eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2252c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 226076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 227076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 228076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 229076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 230076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 231076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 232076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 233076ba34aSJunchao Zhang #endif 23486a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 235076ba34aSJunchao Zhang } 236076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 237eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 238152b3e56SJunchao Zhang PetscFunctionReturn(0); 239152b3e56SJunchao Zhang } 240a587d139SMark 2418c3ff71bSJunchao Zhang /* y = A x */ 2428c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2438c3ff71bSJunchao Zhang { 2448c3ff71bSJunchao Zhang PetscErrorCode ierr; 2458c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 246152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 247152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2488c3ff71bSJunchao Zhang 2498c3ff71bSJunchao Zhang PetscFunctionBegin; 2506af1d01cSJed Brown ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2518c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 252152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 253ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 2548c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 255152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 256152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 257ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 258076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 259152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2606af1d01cSJed Brown ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2618c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2628c3ff71bSJunchao Zhang } 2638c3ff71bSJunchao Zhang 2648c3ff71bSJunchao Zhang /* y = A^T x */ 2658c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2668c3ff71bSJunchao Zhang { 2678c3ff71bSJunchao Zhang PetscErrorCode ierr; 2688c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 269152b3e56SJunchao Zhang const char *mode; 270152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 271152b3e56SJunchao Zhang PetscScalarKokkosView yv; 272076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2738c3ff71bSJunchao Zhang 2748c3ff71bSJunchao Zhang PetscFunctionBegin; 275eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2768c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 277152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 278ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 279152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 280076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 281152b3e56SJunchao Zhang mode = "N"; 282152b3e56SJunchao Zhang } else { 283076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 284076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 285152b3e56SJunchao Zhang mode = "T"; 286152b3e56SJunchao Zhang } 287076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 288152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 289ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 290076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 291eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2928c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2938c3ff71bSJunchao Zhang } 2948c3ff71bSJunchao Zhang 2958c3ff71bSJunchao Zhang /* y = A^H x */ 2968c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2978c3ff71bSJunchao Zhang { 2988c3ff71bSJunchao Zhang PetscErrorCode ierr; 2998c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 300152b3e56SJunchao Zhang const char *mode; 301152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 302152b3e56SJunchao Zhang PetscScalarKokkosView yv; 303076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3048c3ff71bSJunchao Zhang 3058c3ff71bSJunchao Zhang PetscFunctionBegin; 306eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3078c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 308152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 309ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 310152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 311076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 312152b3e56SJunchao Zhang mode = "N"; 313152b3e56SJunchao Zhang } else { 314076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 315076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 316152b3e56SJunchao Zhang mode = "C"; 317152b3e56SJunchao Zhang } 318076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 319152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 320ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr); 321076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 322eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3238c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3248c3ff71bSJunchao Zhang } 3258c3ff71bSJunchao Zhang 3268c3ff71bSJunchao Zhang /* z = A x + y */ 3278c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3288c3ff71bSJunchao Zhang { 3298c3ff71bSJunchao Zhang PetscErrorCode ierr; 3308c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 331152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 332152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3338c3ff71bSJunchao Zhang 3348c3ff71bSJunchao Zhang PetscFunctionBegin; 335eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3368c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 337152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 338152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 339ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 3408c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3418c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 342152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 343152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 346152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 347eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3488c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3498c3ff71bSJunchao Zhang } 3508c3ff71bSJunchao Zhang 3518c3ff71bSJunchao Zhang /* z = A^T x + y */ 3528c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3538c3ff71bSJunchao Zhang { 3548c3ff71bSJunchao Zhang PetscErrorCode ierr; 3558c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 356152b3e56SJunchao Zhang const char *mode; 357152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 358152b3e56SJunchao Zhang PetscScalarKokkosView zv; 359076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3608c3ff71bSJunchao Zhang 3618c3ff71bSJunchao Zhang PetscFunctionBegin; 362eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3638c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 364152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 365152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 366ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 3678c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 368152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 369076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 370152b3e56SJunchao Zhang mode = "N"; 371152b3e56SJunchao Zhang } else { 372076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 373076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 374152b3e56SJunchao Zhang mode = "T"; 375152b3e56SJunchao Zhang } 376076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 377152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 378152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 379ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 380076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 381eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3828c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3838c3ff71bSJunchao Zhang } 3848c3ff71bSJunchao Zhang 3858c3ff71bSJunchao Zhang /* z = A^H x + y */ 3868c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3878c3ff71bSJunchao Zhang { 3888c3ff71bSJunchao Zhang PetscErrorCode ierr; 3898c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 390152b3e56SJunchao Zhang const char *mode; 391152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 392152b3e56SJunchao Zhang PetscScalarKokkosView zv; 393076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3948c3ff71bSJunchao Zhang 3958c3ff71bSJunchao Zhang PetscFunctionBegin; 396eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3978c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 398152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 399152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 400ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 4018c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 402152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 403076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 404152b3e56SJunchao Zhang mode = "N"; 405152b3e56SJunchao Zhang } else { 406076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 407076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 408152b3e56SJunchao Zhang mode = "C"; 409152b3e56SJunchao Zhang } 410076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 411152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 412152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 413ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr); 414076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 415eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 416152b3e56SJunchao Zhang PetscFunctionReturn(0); 417152b3e56SJunchao Zhang } 418152b3e56SJunchao Zhang 419152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 420152b3e56SJunchao Zhang { 421152b3e56SJunchao Zhang PetscErrorCode ierr; 422152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 423152b3e56SJunchao Zhang 424152b3e56SJunchao Zhang PetscFunctionBegin; 425152b3e56SJunchao Zhang switch (op) { 426152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 427152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 428152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 429152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 430152b3e56SJunchao Zhang break; 431152b3e56SJunchao Zhang default: 432152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 433152b3e56SJunchao Zhang break; 434152b3e56SJunchao Zhang } 4358c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4368c3ff71bSJunchao Zhang } 4378c3ff71bSJunchao Zhang 438076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4393d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4408c3ff71bSJunchao Zhang { 4418c3ff71bSJunchao Zhang PetscErrorCode ierr; 442076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4438c3ff71bSJunchao Zhang 4448c3ff71bSJunchao Zhang PetscFunctionBegin; 445a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 446076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 447076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4488c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 449076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 450076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 4512c71b3e2SJacob Faibussowitsch PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 452076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 453076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 454076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 455076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 456076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 457394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 4582c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 459076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4608c3ff71bSJunchao Zhang } 461076ba34aSJunchao Zhang } 4628c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4638c3ff71bSJunchao Zhang } 4648c3ff71bSJunchao Zhang 465076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 466076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 467076ba34aSJunchao Zhang */ 468076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4698c3ff71bSJunchao Zhang { 4708c3ff71bSJunchao Zhang PetscErrorCode ierr; 471076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 472076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 473076ba34aSJunchao Zhang Mat mat; 4748c3ff71bSJunchao Zhang 4758c3ff71bSJunchao Zhang PetscFunctionBegin; 476076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 477076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 478076ba34aSJunchao Zhang mat = *B; 479076ba34aSJunchao Zhang if (A->assembled) { 480076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 481076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 482076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 483076ba34aSJunchao Zhang /* Now copy values to B if needed */ 484076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 485076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 486076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 487076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 488076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 489076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 490076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 491076ba34aSJunchao Zhang } 492076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 493076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 494076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 495076ba34aSJunchao Zhang } 496076ba34aSJunchao Zhang mat->spptr = bkok; 497076ba34aSJunchao Zhang } 498076ba34aSJunchao Zhang 499076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 500076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 501076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 502076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 5038c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5048c3ff71bSJunchao Zhang } 5058c3ff71bSJunchao Zhang 5060ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 5070ecb592aSJunchao Zhang { 5080ecb592aSJunchao Zhang PetscErrorCode ierr; 5090ecb592aSJunchao Zhang Mat At; 510ff751488SJunchao Zhang KokkosCsrMatrix *internT; 5110ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 5120ecb592aSJunchao Zhang 5130ecb592aSJunchao Zhang PetscFunctionBegin; 5140ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 5150ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 516ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 517ff751488SJunchao Zhang CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 5180ecb592aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 5190ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5207a3b1c56SStefano Zampini else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 5210ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5220ecb592aSJunchao Zhang if ((*B)->assembled) { 5230ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5240ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 5250ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 5260ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5270ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5280ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5290ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5300ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 5310ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5320ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5330ecb592aSJunchao Zhang } 5340ecb592aSJunchao Zhang PetscFunctionReturn(0); 5350ecb592aSJunchao Zhang } 5360ecb592aSJunchao Zhang 5378c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5388c3ff71bSJunchao Zhang { 5398c3ff71bSJunchao Zhang PetscErrorCode ierr; 54086a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5418c3ff71bSJunchao Zhang 5428c3ff71bSJunchao Zhang PetscFunctionBegin; 54386a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 54486a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5458c3ff71bSJunchao Zhang delete aijkok; 54686a27549SJunchao Zhang } else { 54786a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 54886a27549SJunchao Zhang } 549*cbc6b225SStefano Zampini A->spptr = NULL; 550152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 55142550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 55242550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 5538c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5548c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5558c3ff71bSJunchao Zhang } 5568c3ff71bSJunchao Zhang 5573f3ba80aSJunchao Zhang /*MC 5583f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 5593f3ba80aSJunchao Zhang 5603f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 5613f3ba80aSJunchao Zhang 5623f3ba80aSJunchao Zhang Options Database Keys: 5633f3ba80aSJunchao Zhang . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 5643f3ba80aSJunchao Zhang 5653f3ba80aSJunchao Zhang Level: beginner 5663f3ba80aSJunchao Zhang 5673f3ba80aSJunchao Zhang .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 5683f3ba80aSJunchao Zhang M*/ 56986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 57086a27549SJunchao Zhang { 57186a27549SJunchao Zhang PetscErrorCode ierr; 57286a27549SJunchao Zhang 57386a27549SJunchao Zhang PetscFunctionBegin; 57486a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 57586a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 57686a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 57786a27549SJunchao Zhang PetscFunctionReturn(0); 57886a27549SJunchao Zhang } 57986a27549SJunchao Zhang 580076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 581076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 582a3f881fbSStefano Zampini { 583076ba34aSJunchao Zhang PetscErrorCode ierr; 584076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 585076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 586076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 587076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 588076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 589076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 590a3f881fbSStefano Zampini 591a3f881fbSStefano Zampini PetscFunctionBegin; 592076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 593076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 594076ba34aSJunchao Zhang PetscValidPointer(C,4); 595076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 596076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 5972c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 5982c71b3e2SJacob Faibussowitsch PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 599076ba34aSJunchao Zhang 600076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 601076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 602076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 603076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 604076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 605076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 606076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 607076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 608076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 609076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 610076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 611076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 612076ba34aSJunchao Zhang nnz = a->nz + b->nz; 613076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 614076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 615076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 616076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 617076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 618076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 619076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 620076ba34aSJunchao Zhang ca = ca_dual.view_device(); 621076ba34aSJunchao Zhang ci = ci_dual.view_device(); 622076ba34aSJunchao Zhang cj = cj_dual.view_device(); 623076ba34aSJunchao Zhang 624076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 625076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 626076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 627076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 628076ba34aSJunchao Zhang 629076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 630076ba34aSJunchao Zhang ci(i) = coffset; 631076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 632076ba34aSJunchao Zhang }); 633076ba34aSJunchao Zhang 634076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 635076ba34aSJunchao Zhang if (k < alen) { 636076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 637076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 638076ba34aSJunchao Zhang } else { 639076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 640076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 641076ba34aSJunchao Zhang } 642076ba34aSJunchao Zhang }); 643076ba34aSJunchao Zhang }); 644076ba34aSJunchao Zhang ca_dual.modify_device(); 645076ba34aSJunchao Zhang ci_dual.modify_device(); 646076ba34aSJunchao Zhang cj_dual.modify_device(); 647076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 648076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 649076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 650076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 651076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 652076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 653076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 654076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 655076ba34aSJunchao Zhang 656076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 657076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 658076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 659076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 660076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 661076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 662076ba34aSJunchao Zhang }); 663076ba34aSJunchao Zhang }); 664076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 665076ba34aSJunchao Zhang } 666076ba34aSJunchao Zhang PetscFunctionReturn(0); 667076ba34aSJunchao Zhang } 668076ba34aSJunchao Zhang 669076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 670076ba34aSJunchao Zhang { 671076ba34aSJunchao Zhang PetscFunctionBegin; 672076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 673a3f881fbSStefano Zampini PetscFunctionReturn(0); 674a3f881fbSStefano Zampini } 675a3f881fbSStefano Zampini 676a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 677a3f881fbSStefano Zampini { 678076ba34aSJunchao Zhang PetscErrorCode ierr; 679a3f881fbSStefano Zampini Mat_Product *product = C->product; 680a3f881fbSStefano Zampini Mat A,B; 681076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 682a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 683a3f881fbSStefano Zampini Mat_SeqAIJ *c; 684076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 685076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 686a3f881fbSStefano Zampini 687a3f881fbSStefano Zampini PetscFunctionBegin; 688a3f881fbSStefano Zampini MatCheckProduct(C,1); 6892c71b3e2SJacob Faibussowitsch PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 690076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 691076ba34aSJunchao Zhang 692076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 693076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 694076ba34aSJunchao Zhang PetscFunctionReturn(0); 695076ba34aSJunchao Zhang } 696076ba34aSJunchao Zhang 697076ba34aSJunchao Zhang switch (product->type) { 698076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 699076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 700076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 701076ba34aSJunchao Zhang default: 70298921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 703076ba34aSJunchao Zhang } 704076ba34aSJunchao Zhang 705a3f881fbSStefano Zampini A = product->A; 706a3f881fbSStefano Zampini B = product->B; 707a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 708a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 709a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 710a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 711a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 712076ba34aSJunchao Zhang 7132c71b3e2SJacob Faibussowitsch PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 714076ba34aSJunchao Zhang 715076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 716076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 717076ba34aSJunchao Zhang 718076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 719076ba34aSJunchao Zhang if (transA) { 720076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 721076ba34aSJunchao Zhang transA = false; 722a3f881fbSStefano Zampini } 723a3f881fbSStefano Zampini 724076ba34aSJunchao Zhang if (transB) { 725076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 726076ba34aSJunchao Zhang transB = false; 727076ba34aSJunchao Zhang } 728eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 729076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 730076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 731eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 732076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 733a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 734a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 7357d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 736a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 7377d3de750SJacob Faibussowitsch ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 738a3f881fbSStefano Zampini c->reallocs = 0; 739076ba34aSJunchao Zhang C->info.mallocs = 0; 740a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 741a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 742a3f881fbSStefano Zampini C->num_ass++; 743a3f881fbSStefano Zampini PetscFunctionReturn(0); 744a3f881fbSStefano Zampini } 745a3f881fbSStefano Zampini 746a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 747a3f881fbSStefano Zampini { 748a3f881fbSStefano Zampini PetscErrorCode ierr; 749076ba34aSJunchao Zhang Mat_Product *product = C->product; 750076ba34aSJunchao Zhang MatProductType ptype; 751076ba34aSJunchao Zhang Mat A,B; 752076ba34aSJunchao Zhang bool transA,transB; 753076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 754076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 755076ba34aSJunchao Zhang MPI_Comm comm; 756076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 757a3f881fbSStefano Zampini 758a3f881fbSStefano Zampini PetscFunctionBegin; 759a3f881fbSStefano Zampini MatCheckProduct(C,1); 760076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 7612c71b3e2SJacob Faibussowitsch PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 762a3f881fbSStefano Zampini A = product->A; 763a3f881fbSStefano Zampini B = product->B; 764a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 765a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 766a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 767a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 768076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 769076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 770076ba34aSJunchao Zhang 771a3f881fbSStefano Zampini ptype = product->type; 772a3f881fbSStefano Zampini switch (ptype) { 773076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 774076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 775076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 776a3f881fbSStefano Zampini default: 77798921bdaSJacob Faibussowitsch SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 778a3f881fbSStefano Zampini } 779a3f881fbSStefano Zampini 780076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 781076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 782076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 783076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 784a3f881fbSStefano Zampini 785076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 786076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7876ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7886ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7896ffb9508SJunchao Zhang */ 790076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 791076ba34aSJunchao Zhang 792eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 793076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 794076ba34aSJunchao Zhang if (transA) { 795076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 796076ba34aSJunchao Zhang transA = false; 797076ba34aSJunchao Zhang } 798076ba34aSJunchao Zhang 799076ba34aSJunchao Zhang if (transB) { 800076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 801076ba34aSJunchao Zhang transB = false; 802076ba34aSJunchao Zhang } 803076ba34aSJunchao Zhang 804076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 805076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 806076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 807076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 808076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 809076ba34aSJunchao Zhang */ 810076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 811076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 812eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 813076ba34aSJunchao Zhang 814076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 815076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 816076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 817a3f881fbSStefano Zampini PetscFunctionReturn(0); 818a3f881fbSStefano Zampini } 819a3f881fbSStefano Zampini 820a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 821076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 822a3f881fbSStefano Zampini { 823a3f881fbSStefano Zampini PetscErrorCode ierr; 824076ba34aSJunchao Zhang Mat_Product *product = mat->product; 825a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 826a3f881fbSStefano Zampini 827a3f881fbSStefano Zampini PetscFunctionBegin; 828a3f881fbSStefano Zampini MatCheckProduct(mat,1); 829a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 830a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 831a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 832a3f881fbSStefano Zampini } 833a3f881fbSStefano Zampini if (Biskok && Ciskok) { 834a3f881fbSStefano Zampini switch (product->type) { 835a3f881fbSStefano Zampini case MATPRODUCT_AB: 836a3f881fbSStefano Zampini case MATPRODUCT_AtB: 837a3f881fbSStefano Zampini case MATPRODUCT_ABt: 838a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 839a3f881fbSStefano Zampini break; 840a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 841a3f881fbSStefano Zampini case MATPRODUCT_RARt: 842a3f881fbSStefano Zampini case MATPRODUCT_ABC: 843a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 844a3f881fbSStefano Zampini break; 845a3f881fbSStefano Zampini default: 846a3f881fbSStefano Zampini break; 847a3f881fbSStefano Zampini } 848a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 849a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 850a3f881fbSStefano Zampini } 851a3f881fbSStefano Zampini PetscFunctionReturn(0); 852a3f881fbSStefano Zampini } 853a587d139SMark 854f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 855f0cf5187SStefano Zampini { 856f0cf5187SStefano Zampini PetscErrorCode ierr; 857f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 858f0cf5187SStefano Zampini 859f0cf5187SStefano Zampini PetscFunctionBegin; 860eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 861f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 862f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 863076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 864076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 865076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 8666af1d01cSJed Brown ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 867f0cf5187SStefano Zampini PetscFunctionReturn(0); 868f0cf5187SStefano Zampini } 869f0cf5187SStefano Zampini 870a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 871a587d139SMark { 872a587d139SMark PetscErrorCode ierr; 873076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 874a587d139SMark 875a587d139SMark PetscFunctionBegin; 876076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8772328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 878076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 879076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 8802328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8812328674fSJunchao Zhang ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 8822328674fSJunchao Zhang } 883a587d139SMark PetscFunctionReturn(0); 884a587d139SMark } 885a587d139SMark 886db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 88742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 888db78de30SJunchao Zhang { 889db78de30SJunchao Zhang PetscErrorCode ierr; 890db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 891db78de30SJunchao Zhang 892db78de30SJunchao Zhang PetscFunctionBegin; 893db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 894db78de30SJunchao Zhang PetscValidPointer(kv,2); 895db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 896db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 897db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 898076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 899db78de30SJunchao Zhang PetscFunctionReturn(0); 900db78de30SJunchao Zhang } 901db78de30SJunchao Zhang 90242550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 903db78de30SJunchao Zhang { 904db78de30SJunchao Zhang PetscFunctionBegin; 905db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 906db78de30SJunchao Zhang PetscValidPointer(kv,2); 907db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 908db78de30SJunchao Zhang PetscFunctionReturn(0); 909db78de30SJunchao Zhang } 910db78de30SJunchao Zhang 91142550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 912db78de30SJunchao Zhang { 913db78de30SJunchao Zhang PetscErrorCode ierr; 914db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 915db78de30SJunchao Zhang 916db78de30SJunchao Zhang PetscFunctionBegin; 917db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 918db78de30SJunchao Zhang PetscValidPointer(kv,2); 919db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 920db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 921db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 922076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 923db78de30SJunchao Zhang PetscFunctionReturn(0); 924db78de30SJunchao Zhang } 925db78de30SJunchao Zhang 92642550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 927db78de30SJunchao Zhang { 928db78de30SJunchao Zhang PetscErrorCode ierr; 929db78de30SJunchao Zhang 930db78de30SJunchao Zhang PetscFunctionBegin; 931db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 932db78de30SJunchao Zhang PetscValidPointer(kv,2); 933db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 934076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 935db78de30SJunchao Zhang PetscFunctionReturn(0); 936db78de30SJunchao Zhang } 937db78de30SJunchao Zhang 93842550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 939db78de30SJunchao Zhang { 940db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 941db78de30SJunchao Zhang 942db78de30SJunchao Zhang PetscFunctionBegin; 943db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 944db78de30SJunchao Zhang PetscValidPointer(kv,2); 945db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 946db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 947076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 948db78de30SJunchao Zhang PetscFunctionReturn(0); 949db78de30SJunchao Zhang } 950db78de30SJunchao Zhang 95142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 952db78de30SJunchao Zhang { 953db78de30SJunchao Zhang PetscErrorCode ierr; 954db78de30SJunchao Zhang 955db78de30SJunchao Zhang PetscFunctionBegin; 956db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 957db78de30SJunchao Zhang PetscValidPointer(kv,2); 958db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 959076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 960db78de30SJunchao Zhang PetscFunctionReturn(0); 961db78de30SJunchao Zhang } 962db78de30SJunchao Zhang 963c17cf699SJunchao Zhang /* Computes Y += alpha X */ 964c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 965a587d139SMark { 966a587d139SMark PetscErrorCode ierr; 967a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 968c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 969c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 970c17cf699SJunchao Zhang MatScalarKokkosView Ya; 971a587d139SMark 972a587d139SMark PetscFunctionBegin; 973c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 974c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 975c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 976c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 977eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 978db78de30SJunchao Zhang 979c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 980c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 981a587d139SMark PetscBool e; 982a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 983a587d139SMark if (e) { 984a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 985c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 986a587d139SMark } 987a587d139SMark } 988db78de30SJunchao Zhang 989c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 990c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 991c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 992c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 993c17cf699SJunchao Zhang */ 994c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 995c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 996c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 997c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 998c17cf699SJunchao Zhang 999c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1000c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 1001c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 1002c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1003c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1004c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1005c17cf699SJunchao Zhang 1006c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1007c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 1008c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1009c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 1010c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1011c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1012c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1013c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1014c17cf699SJunchao Zhang q++; 1015a587d139SMark } else { 1016c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1017c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1018c17cf699SJunchao Zhang */ 1019c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1020a587d139SMark } 1021c17cf699SJunchao Zhang } 1022c17cf699SJunchao Zhang }); 1023c17cf699SJunchao Zhang }); 1024c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 1025c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1026c17cf699SJunchao Zhang Mat Z; 1027c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1028c17cf699SJunchao Zhang KernelHandle kh; 1029c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1030c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1031c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1032c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 1033c17cf699SJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1034c17cf699SJunchao Zhang ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1035c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1036c17cf699SJunchao Zhang } 1037eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1038eeadb341SJunchao Zhang ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1039a587d139SMark PetscFunctionReturn(0); 1040a587d139SMark } 1041a587d139SMark 1042394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 104342550becSJunchao Zhang { 104442550becSJunchao Zhang PetscErrorCode ierr; 104542550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 104642550becSJunchao Zhang Mat_SeqAIJ *aseq; 104742550becSJunchao Zhang 104842550becSJunchao Zhang PetscFunctionBegin; 1049*cbc6b225SStefano Zampini ierr = MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 1050394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(mat->data); 105142550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1052*cbc6b225SStefano Zampini delete akok; 1053*cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1054*cbc6b225SStefano Zampini ierr = MatZeroEntries_SeqAIJKokkos(mat);CHKERRQ(ierr); 1055394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 105642550becSJunchao Zhang PetscFunctionReturn(0); 105742550becSJunchao Zhang } 105842550becSJunchao Zhang 105942550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 106042550becSJunchao Zhang { 106142550becSJunchao Zhang PetscErrorCode ierr; 106242550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 106342550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1064394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1065394ed5ebSJunchao Zhang const PetscCountKokkosView& jmap = akok->jmap_d; 1066394ed5ebSJunchao Zhang const PetscCountKokkosView& perm = akok->perm_d; 106742550becSJunchao Zhang MatScalarKokkosView Aa; 106842550becSJunchao Zhang ConstMatScalarKokkosView kv; 106942550becSJunchao Zhang PetscMemType memtype; 107042550becSJunchao Zhang 107142550becSJunchao Zhang PetscFunctionBegin; 107242550becSJunchao Zhang ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 107342550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1074394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 107542550becSJunchao Zhang } else { 1076394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 107742550becSJunchao Zhang } 107842550becSJunchao Zhang 1079*cbc6b225SStefano Zampini if (imode == INSERT_VALUES) { 1080*cbc6b225SStefano Zampini ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */ 1081*cbc6b225SStefano Zampini Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1082*cbc6b225SStefano Zampini } else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */ 108342550becSJunchao Zhang 1084*cbc6b225SStefano Zampini Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));}); 1085394ed5ebSJunchao Zhang 1086394ed5ebSJunchao Zhang if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} 1087394ed5ebSJunchao Zhang else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);} 108842550becSJunchao Zhang PetscFunctionReturn(0); 108942550becSJunchao Zhang } 109042550becSJunchao Zhang 109186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 10928f7e8f9dSMark Adams { 10938f7e8f9dSMark Adams PetscErrorCode ierr; 10948f7e8f9dSMark Adams 10958f7e8f9dSMark Adams PetscFunctionBegin; 10968f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 10978f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 10988f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 10998f7e8f9dSMark Adams PetscFunctionReturn(0); 11008f7e8f9dSMark Adams } 11018f7e8f9dSMark Adams 11028c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 11038c3ff71bSJunchao Zhang { 110442550becSJunchao Zhang PetscErrorCode ierr; 1105076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1106076ba34aSJunchao Zhang 11078c3ff71bSJunchao Zhang PetscFunctionBegin; 1108076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 11096f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 11106f3d89d0SStefano Zampini 11118c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 11128c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 11138c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1114a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1115f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1116a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1117076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 11188c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 11198c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 11208c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 11218c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 11228c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 11238c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1124076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 11250ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1126152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1127076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1128076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1129076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1130076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1131076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1132076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 113342550becSJunchao Zhang 113442550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 113542550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1136076ba34aSJunchao Zhang PetscFunctionReturn(0); 1137076ba34aSJunchao Zhang } 1138076ba34aSJunchao Zhang 1139076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1140076ba34aSJunchao Zhang { 1141076ba34aSJunchao Zhang PetscErrorCode ierr; 1142076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1143076ba34aSJunchao Zhang PetscInt i,m,n; 1144076ba34aSJunchao Zhang 1145076ba34aSJunchao Zhang PetscFunctionBegin; 11462c71b3e2SJacob Faibussowitsch PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1147076ba34aSJunchao Zhang 1148076ba34aSJunchao Zhang m = akok->nrows(); 1149076ba34aSJunchao Zhang n = akok->ncols(); 1150076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1151076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1152076ba34aSJunchao Zhang 1153076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1154076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1155076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1156076ba34aSJunchao Zhang 1157076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1158076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1159076ba34aSJunchao Zhang 1160076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1161076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1162076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1163076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1164076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1165076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1166076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1167076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1168076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1169076ba34aSJunchao Zhang 1170076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1171076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1172076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1173076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1174076ba34aSJunchao Zhang } 1175076ba34aSJunchao Zhang 1176076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1177076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1178ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1179076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1180076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1181076ba34aSJunchao Zhang PetscFunctionReturn(0); 1182076ba34aSJunchao Zhang } 1183076ba34aSJunchao Zhang 1184076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1185076ba34aSJunchao Zhang 1186076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1187076ba34aSJunchao Zhang */ 1188076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1189076ba34aSJunchao Zhang { 1190076ba34aSJunchao Zhang PetscErrorCode ierr; 1191076ba34aSJunchao Zhang 1192076ba34aSJunchao Zhang PetscFunctionBegin; 1193076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1194076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 11958c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11968c3ff71bSJunchao Zhang } 11978c3ff71bSJunchao Zhang 11988c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1199152b3e56SJunchao Zhang /*@C 12008c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 12018c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 12028c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 12038c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 12048c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 12058c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 12068c3ff71bSJunchao Zhang 12078c3ff71bSJunchao Zhang Collective 12088c3ff71bSJunchao Zhang 12098c3ff71bSJunchao Zhang Input Parameters: 12108c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 12118c3ff71bSJunchao Zhang . m - number of rows 12128c3ff71bSJunchao Zhang . n - number of columns 12138c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 12148c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 12158c3ff71bSJunchao Zhang (possibly different for each row) or NULL 12168c3ff71bSJunchao Zhang 12178c3ff71bSJunchao Zhang Output Parameter: 12188c3ff71bSJunchao Zhang . A - the matrix 12198c3ff71bSJunchao Zhang 12208c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 12218c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 12228c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 12238c3ff71bSJunchao Zhang 12248c3ff71bSJunchao Zhang Notes: 12258c3ff71bSJunchao Zhang If nnz is given then nz is ignored 12268c3ff71bSJunchao Zhang 12278c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 12288c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 12298c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 12308c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 12318c3ff71bSJunchao Zhang 12328c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 12338c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 12348c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 12358c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 12368c3ff71bSJunchao Zhang 12378c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 12388c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 12398c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 12408c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 12418c3ff71bSJunchao Zhang 12428c3ff71bSJunchao Zhang Level: intermediate 12438c3ff71bSJunchao Zhang 1244076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 12458c3ff71bSJunchao Zhang @*/ 12468c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 12478c3ff71bSJunchao Zhang { 12488c3ff71bSJunchao Zhang PetscErrorCode ierr; 12498c3ff71bSJunchao Zhang 12508c3ff71bSJunchao Zhang PetscFunctionBegin; 12518c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 12528c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 12538c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 12548c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 12558c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 12568c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12578c3ff71bSJunchao Zhang } 1258930e68a5SMark Adams 12598f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 12608f7e8f9dSMark Adams // 126146804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 12628f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 12638f7e8f9dSMark Adams // 12648f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1265930e68a5SMark Adams { 12668f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 12678f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 12688f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1269930e68a5SMark Adams PetscErrorCode ierr; 12708f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1271300d22a6SJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1272076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1273076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 12748f7e8f9dSMark Adams PetscBool row_identity,col_identity; 127546804e07SMark Adams PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1276930e68a5SMark Adams 1277930e68a5SMark Adams PetscFunctionBegin; 12782c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 12798f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 12802c71b3e2SJacob Faibussowitsch PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 12818f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 12828f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 12838f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 12848f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 12858f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 12868f7e8f9dSMark Adams { 12878f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 12888f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 12898f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 12908f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 12918f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 12928f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 12938f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 12948f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 12958f7e8f9dSMark Adams size_t flops_h = 0.0; 12968f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 12978f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 12988f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 12998f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 13008f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 13018f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 13028f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 13038f7e8f9dSMark Adams // Fill A --> fact 13048f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1305042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 13068f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 13078f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 13088f7e8f9dSMark Adams // zero rows of B 13098f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 13108f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 13118f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 13128f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 13138f7e8f9dSMark Adams /* zero (unfactored row) */ 13148f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 13158f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 13168f7e8f9dSMark Adams }); 13178f7e8f9dSMark Adams // copy A into B 13188f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 13198f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 13208f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 13218f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 13228f7e8f9dSMark Adams /* load in initial (unfactored row) */ 13238f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 13248f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 13258f7e8f9dSMark Adams PetscScalar vala = av[j]; 13268f7e8f9dSMark Adams if (colb == rowb) { 13278f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 13288f7e8f9dSMark Adams } else { 13298f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13308f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13318f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 13328f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 13338f7e8f9dSMark Adams if (pbj[j] == colb) { 13348f7e8f9dSMark Adams pba[j] = vala; 13358f7e8f9dSMark Adams set++; 13368f7e8f9dSMark Adams break; 13378f7e8f9dSMark Adams } 13388f7e8f9dSMark Adams } 13398f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 13408f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 13418f1da0b2SJunchao Zhang #endif 13428f7e8f9dSMark Adams } 13438f7e8f9dSMark Adams } 13448f7e8f9dSMark Adams }); 13458f7e8f9dSMark Adams }); 13468f7e8f9dSMark Adams Kokkos::fence(); 1347930e68a5SMark Adams 13488f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 13498f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13508f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13518f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1352042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 13538f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 13548f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 13558f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 13568f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 13578f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 13588f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 13598f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 13608f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 13618f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 13628f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 13638f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 13648f7e8f9dSMark Adams else { 13658f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 13668f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 13678f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 13688f7e8f9dSMark Adams size_t st_idx; 13698f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 13708f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 13718f7e8f9dSMark Adams // get column, there has got to be a better way 13728f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 13738f7e8f9dSMark Adams if (pjL[j] == ii) { 13748f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13758f7e8f9dSMark Adams idx = j; // output 13768f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 13778f7e8f9dSMark Adams } 13788f7e8f9dSMark Adams }, st_idx); 13798f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 13808f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 138199551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 138299551766SMark Adams #endif 138399551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 13848f7e8f9dSMark Adams // U(i+1,:end) 13858f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 13868f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 13878f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 13888f7e8f9dSMark Adams if (col==myk) { 13898f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 13908f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 13918f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 13928f7e8f9dSMark Adams } else { 13938f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 13948f7e8f9dSMark Adams PetscInt high, low; 13958f7e8f9dSMark Adams const PetscInt *startj; 13968f7e8f9dSMark Adams if (col<myk) { // L 13978f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 13988f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 13998f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 14008f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 14018f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 14028f7e8f9dSMark Adams } else { 14038f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 14048f7e8f9dSMark Adams start = ba_d + idx; 14058f7e8f9dSMark Adams startj= bj_d + idx; 14068f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 14078f7e8f9dSMark Adams } 14088f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 14098f7e8f9dSMark Adams low = 0; 14108f7e8f9dSMark Adams high = (PetscInt)(end-start); 14118f7e8f9dSMark Adams while (high-low > 5) { 14128f7e8f9dSMark Adams int t = (low+high)/2; 14138f7e8f9dSMark Adams if (startj[t] > col) high = t; 14148f7e8f9dSMark Adams else low = t; 14158f7e8f9dSMark Adams } 14168f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 14178f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 14188f7e8f9dSMark Adams } 14198f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 142099551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 142199551766SMark Adams #endif 14228f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 14238f7e8f9dSMark Adams } 14248f7e8f9dSMark Adams }); 14258f7e8f9dSMark Adams } 14268f7e8f9dSMark Adams }); 14278f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 14288f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 14298f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 14308f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 14318f7e8f9dSMark Adams }); 14328f7e8f9dSMark Adams Kokkos::fence(); 14338f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 14348f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 14358f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 14368f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 14378f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 14388f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 14398f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 14408f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 14418f7e8f9dSMark Adams if (i < end) { 14428f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 14438f7e8f9dSMark Adams *pv = 1.0/(*pv); 14448f7e8f9dSMark Adams } 14458f7e8f9dSMark Adams }); 14468f7e8f9dSMark Adams }); 14478f7e8f9dSMark Adams } 14488f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 14498f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 14508f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 14518f7e8f9dSMark Adams 14528f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 14538f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 14548f7e8f9dSMark Adams if (b->inode.size) { 14558f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 14568f7e8f9dSMark Adams } else if (row_identity && col_identity) { 14578f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 14588f7e8f9dSMark Adams } else { 14598f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 14608f7e8f9dSMark Adams } 14618f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 14628f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 14638f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 14648f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 14658f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 14668f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 14678f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 14688f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 14698f7e8f9dSMark Adams 1470930e68a5SMark Adams PetscFunctionReturn(0); 1471930e68a5SMark Adams } 1472930e68a5SMark Adams 147386a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1474930e68a5SMark Adams { 1475930e68a5SMark Adams PetscErrorCode ierr; 1476930e68a5SMark Adams 1477930e68a5SMark Adams PetscFunctionBegin; 1478930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 147986a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 148086a27549SJunchao Zhang PetscFunctionReturn(0); 148186a27549SJunchao Zhang } 148286a27549SJunchao Zhang 148386a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 148486a27549SJunchao Zhang { 148586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 148686a27549SJunchao Zhang 148786a27549SJunchao Zhang PetscFunctionBegin; 148886a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 148986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 149086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 149186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 149286a27549SJunchao Zhang } 149386a27549SJunchao Zhang PetscFunctionReturn(0); 149486a27549SJunchao Zhang } 149586a27549SJunchao Zhang 149686a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 149786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 149886a27549SJunchao Zhang { 149986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1500076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 150186a27549SJunchao Zhang 150286a27549SJunchao Zhang PetscFunctionBegin; 150386a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 150486a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1505076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 150686a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1507076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1508076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 150986a27549SJunchao Zhang 151086a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1511076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1512076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1513076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 151486a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 151586a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 151686a27549SJunchao Zhang 151786a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 151886a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 151986a27549SJunchao Zhang */ 1520076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1521076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 152286a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 152386a27549SJunchao Zhang 152486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 152586a27549SJunchao Zhang 152686a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1527076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 152886a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1529076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1530076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 153186a27549SJunchao Zhang 153286a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1533076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1534076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1535076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 153686a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 153786a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 153886a27549SJunchao Zhang 153986a27549SJunchao Zhang /* Sort indices. See comments above */ 1540076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1541076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 154286a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 154386a27549SJunchao Zhang 154486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 154586a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 154686a27549SJunchao Zhang } 154786a27549SJunchao Zhang PetscFunctionReturn(0); 154886a27549SJunchao Zhang } 154986a27549SJunchao Zhang 155086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 155186a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 155286a27549SJunchao Zhang { 155386a27549SJunchao Zhang PetscErrorCode ierr; 155486a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 155586a27549SJunchao Zhang PetscScalarKokkosView xv; 155686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 155786a27549SJunchao Zhang 155886a27549SJunchao Zhang PetscFunctionBegin; 1559eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 156086a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 156186a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1562ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 156386a27549SJunchao Zhang /* Solve L tmpv = b */ 15643f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 156586a27549SJunchao Zhang /* Solve Ux = tmpv */ 15663f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 156786a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1568ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1569eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 157086a27549SJunchao Zhang PetscFunctionReturn(0); 157186a27549SJunchao Zhang } 157286a27549SJunchao Zhang 1573076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 157486a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 157586a27549SJunchao Zhang { 157686a27549SJunchao Zhang PetscErrorCode ierr; 157786a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 157886a27549SJunchao Zhang PetscScalarKokkosView xv; 157986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 158086a27549SJunchao Zhang 158186a27549SJunchao Zhang PetscFunctionBegin; 1582eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 158386a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 158486a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1585ad7ac7b2SJunchao Zhang ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr); 158686a27549SJunchao Zhang /* Solve U^T tmpv = b */ 158786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 158886a27549SJunchao Zhang 158986a27549SJunchao Zhang /* Solve L^T x = tmpv */ 159086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 159186a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1592ad7ac7b2SJunchao Zhang ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr); 1593eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 159486a27549SJunchao Zhang PetscFunctionReturn(0); 159586a27549SJunchao Zhang } 159686a27549SJunchao Zhang 159786a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 159886a27549SJunchao Zhang { 159986a27549SJunchao Zhang PetscErrorCode ierr; 160086a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 160186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 160286a27549SJunchao Zhang PetscInt fill_lev = info->levels; 160386a27549SJunchao Zhang 160486a27549SJunchao Zhang PetscFunctionBegin; 1605eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 160686a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1607076ba34aSJunchao Zhang 1608076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1609076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1610076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1611076ba34aSJunchao Zhang 1612076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 161386a27549SJunchao Zhang 161486a27549SJunchao Zhang B->assembled = PETSC_TRUE; 161586a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 161686a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 161786a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 161886a27549SJunchao Zhang B->ops->matsolve = NULL; 161986a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 162086a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 162186a27549SJunchao Zhang 162286a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 162386a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 162486a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1625eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 1626eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 162786a27549SJunchao Zhang PetscFunctionReturn(0); 162886a27549SJunchao Zhang } 162986a27549SJunchao Zhang 163086a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 163186a27549SJunchao Zhang { 163286a27549SJunchao Zhang PetscErrorCode ierr; 163386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 163486a27549SJunchao Zhang Mat_SeqAIJ *b; 163586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 163686a27549SJunchao Zhang PetscInt fill_lev = info->levels; 163786a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 163886a27549SJunchao Zhang PetscInt n = A->rmap->n; 163986a27549SJunchao Zhang 164086a27549SJunchao Zhang PetscFunctionBegin; 164186a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 164286a27549SJunchao Zhang /* Rebuild factors */ 164386a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 164486a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 164586a27549SJunchao Zhang 164686a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 164786a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 164886a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 164986a27549SJunchao Zhang 165086a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 165186a27549SJunchao Zhang 165286a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 165386a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 165486a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 165586a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 165686a27549SJunchao Zhang 165786a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1658076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1659076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1660076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 166186a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 166286a27549SJunchao Zhang 166386a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 166486a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 166586a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 166686a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 166786a27549SJunchao Zhang 166886a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 166986a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 167086a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 167186a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 167286a27549SJunchao Zhang #else 167386a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 167486a27549SJunchao Zhang #endif 167586a27549SJunchao Zhang 167686a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 167786a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 167886a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 167986a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 168086a27549SJunchao Zhang 168186a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 168286a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 168386a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 168486a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 168586a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 168686a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 168786a27549SJunchao Zhang 168886a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 168986a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1690930e68a5SMark Adams PetscFunctionReturn(0); 1691930e68a5SMark Adams } 1692930e68a5SMark Adams 16938f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 16948f7e8f9dSMark Adams { 16958f7e8f9dSMark Adams PetscErrorCode ierr; 16968f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 16978f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1698930e68a5SMark Adams 16998f7e8f9dSMark Adams PetscFunctionBegin; 17008f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 17018f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 17028f7e8f9dSMark Adams // move B data into Kokkos 17038f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 17048f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 17058f7e8f9dSMark Adams { 17068f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1707300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 17088f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1709300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1710300d22a6SJunchao Zhang Kokkos::deep_copy (baijkok->diag_d, h_diag); 17118f7e8f9dSMark Adams } 17128f7e8f9dSMark Adams } 17138f7e8f9dSMark Adams PetscFunctionReturn(0); 17148f7e8f9dSMark Adams } 17158f7e8f9dSMark Adams 171686a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1717930e68a5SMark Adams { 1718930e68a5SMark Adams PetscFunctionBegin; 1719930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1720930e68a5SMark Adams PetscFunctionReturn(0); 1721930e68a5SMark Adams } 1722930e68a5SMark Adams 17238f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 17248f7e8f9dSMark Adams { 17258f7e8f9dSMark Adams PetscFunctionBegin; 17268f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 17278f7e8f9dSMark Adams PetscFunctionReturn(0); 17288f7e8f9dSMark Adams } 17298f7e8f9dSMark Adams 1730930e68a5SMark Adams /*MC 173186a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 173286a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1733930e68a5SMark Adams 1734930e68a5SMark Adams Level: beginner 1735930e68a5SMark Adams 17363f3ba80aSJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1737930e68a5SMark Adams M*/ 173886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1739930e68a5SMark Adams { 1740930e68a5SMark Adams PetscErrorCode ierr; 1741930e68a5SMark Adams PetscInt n = A->rmap->n; 1742930e68a5SMark Adams 1743930e68a5SMark Adams PetscFunctionBegin; 1744930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1745930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1746930e68a5SMark Adams (*B)->factortype = ftype; 17474ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1748930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1749930e68a5SMark Adams 17508f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1751930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 175286a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 175386a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 175486a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 175586a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 175686a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 175786a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 175898921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1759930e68a5SMark Adams 1760930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 176186a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1762930e68a5SMark Adams PetscFunctionReturn(0); 1763930e68a5SMark Adams } 17648f7e8f9dSMark Adams 17658f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 17668f7e8f9dSMark Adams { 17678f7e8f9dSMark Adams PetscErrorCode ierr; 17688f7e8f9dSMark Adams PetscInt n = A->rmap->n; 17698f7e8f9dSMark Adams 17708f7e8f9dSMark Adams PetscFunctionBegin; 17718f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 17728f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 17738f7e8f9dSMark Adams (*B)->factortype = ftype; 1774f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 17754ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 17768f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 17778f7e8f9dSMark Adams 17788f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17798f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 17808f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17818f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 17828f7e8f9dSMark Adams 17838f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 17848f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 17858f7e8f9dSMark Adams PetscFunctionReturn(0); 17868f7e8f9dSMark Adams } 178786a27549SJunchao Zhang 178886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 178986a27549SJunchao Zhang { 179086a27549SJunchao Zhang PetscErrorCode ierr; 179186a27549SJunchao Zhang 179286a27549SJunchao Zhang PetscFunctionBegin; 179386a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 179486a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 179586a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 179686a27549SJunchao Zhang PetscFunctionReturn(0); 179786a27549SJunchao Zhang } 179886a27549SJunchao Zhang 1799076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1800076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1801076ba34aSJunchao Zhang { 1802076ba34aSJunchao Zhang PetscErrorCode ierr; 1803076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1804076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1805076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1806076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1807076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1808076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1809076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1810076ba34aSJunchao Zhang 1811076ba34aSJunchao Zhang PetscFunctionBegin; 1812c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1813076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1814c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1815076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1816c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1817076ba34aSJunchao Zhang } 1818076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1819076ba34aSJunchao Zhang } 1820076ba34aSJunchao Zhang PetscFunctionReturn(0); 1821076ba34aSJunchao Zhang } 1822