111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 48c3ff71bSJunchao Zhang #include <petscsystypes.h> 58c3ff71bSJunchao Zhang #include <petscerror.h> 68c3ff71bSJunchao Zhang 78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 8f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 9076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 288c3ff71bSJunchao Zhang PetscErrorCode ierr; 29076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 30076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang PetscFunctionBegin; 33076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 348c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35076ba34aSJunchao Zhang 36076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38076ba34aSJunchao Zhang 39076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 40076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 42076ba34aSJunchao Zhang */ 43076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44076ba34aSJunchao Zhang delete aijkok; 45076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46076ba34aSJunchao Zhang A->spptr = aijkok; 47076ba34aSJunchao Zhang } 48076ba34aSJunchao Zhang 49a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 50a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51a587d139SMark } 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 578c3ff71bSJunchao Zhang { 588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598c3ff71bSJunchao Zhang 608c3ff71bSJunchao Zhang PetscFunctionBegin; 6186a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62076ba34aSJunchao Zhang if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 65076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 66076ba34aSJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 6786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang PetscFunctionReturn(0); 708c3ff71bSJunchao Zhang } 718c3ff71bSJunchao Zhang 72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 73fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7486a27549SJunchao Zhang { 7586a27549SJunchao Zhang PetscErrorCode ierr; 7686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7786a27549SJunchao Zhang 7886a27549SJunchao Zhang PetscFunctionBegin; 7986a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 8086a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8186a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 842328674fSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 8586a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8686a27549SJunchao Zhang PetscFunctionReturn(0); 8786a27549SJunchao Zhang } 8886a27549SJunchao Zhang 89f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90f0cf5187SStefano Zampini { 91f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92f0cf5187SStefano Zampini 93f0cf5187SStefano Zampini PetscFunctionBegin; 94f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9586a27549SJunchao Zhang /* We do not expect one needs factors on host */ 9686a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 99f0cf5187SStefano Zampini PetscFunctionReturn(0); 100f0cf5187SStefano Zampini } 101f0cf5187SStefano Zampini 102f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103f0cf5187SStefano Zampini { 104076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105f0cf5187SStefano Zampini 106f0cf5187SStefano Zampini PetscFunctionBegin; 107076ba34aSJunchao Zhang if (aijkok) { 108076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 109076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 110076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 111076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112076ba34aSJunchao Zhang } 113076ba34aSJunchao Zhang PetscFunctionReturn(0); 114076ba34aSJunchao Zhang } 115076ba34aSJunchao Zhang 116076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117076ba34aSJunchao Zhang { 118076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119076ba34aSJunchao Zhang 120076ba34aSJunchao Zhang PetscFunctionBegin; 121076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 122076ba34aSJunchao Zhang PetscFunctionReturn(0); 123076ba34aSJunchao Zhang } 124076ba34aSJunchao Zhang 125076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126076ba34aSJunchao Zhang { 127076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128076ba34aSJunchao Zhang 129076ba34aSJunchao Zhang PetscFunctionBegin; 1302328674fSJunchao Zhang if (aijkok) { 131076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 132076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1332328674fSJunchao Zhang } else { 1342328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1352328674fSJunchao Zhang } 136076ba34aSJunchao Zhang PetscFunctionReturn(0); 137076ba34aSJunchao Zhang } 138076ba34aSJunchao Zhang 139076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140076ba34aSJunchao Zhang { 141076ba34aSJunchao Zhang PetscFunctionBegin; 142076ba34aSJunchao Zhang *array = NULL; 143076ba34aSJunchao Zhang PetscFunctionReturn(0); 144076ba34aSJunchao Zhang } 145076ba34aSJunchao Zhang 146076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147076ba34aSJunchao Zhang { 148076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149076ba34aSJunchao Zhang 150076ba34aSJunchao Zhang PetscFunctionBegin; 1512328674fSJunchao Zhang if (aijkok) { 152076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1532328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1542328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1552328674fSJunchao Zhang } 156076ba34aSJunchao Zhang PetscFunctionReturn(0); 157076ba34aSJunchao Zhang } 158076ba34aSJunchao Zhang 159076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160076ba34aSJunchao Zhang { 161076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162076ba34aSJunchao Zhang 163076ba34aSJunchao Zhang PetscFunctionBegin; 1642328674fSJunchao Zhang if (aijkok) { 165076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 166076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1672328674fSJunchao Zhang } 168f0cf5187SStefano Zampini PetscFunctionReturn(0); 169f0cf5187SStefano Zampini } 170f0cf5187SStefano Zampini 171a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173a587d139SMark { 174042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176a587d139SMark 177a587d139SMark PetscFunctionBegin; 178076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181a587d139SMark PetscFunctionReturn(0); 182a587d139SMark } 183a587d139SMark 184a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186a587d139SMark { 187042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188a587d139SMark 189a587d139SMark PetscFunctionBegin; 190a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 191a587d139SMark *d_mat = aijkok->device_mat_d.data(); 192a587d139SMark } else { 193a587d139SMark PetscErrorCode ierr; 194a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195a587d139SMark *d_mat = NULL; 196a587d139SMark } 197a587d139SMark PetscFunctionReturn(0); 198a587d139SMark } 199076ba34aSJunchao Zhang 200076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 201076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202152b3e56SJunchao Zhang { 203152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204152b3e56SJunchao Zhang 205152b3e56SJunchao Zhang PetscFunctionBegin; 206076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 210076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 21286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 213076ba34aSJunchao Zhang } 214076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 215152b3e56SJunchao Zhang PetscFunctionReturn(0); 216152b3e56SJunchao Zhang } 217152b3e56SJunchao Zhang 218076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220152b3e56SJunchao Zhang { 221eeadb341SJunchao Zhang PetscErrorCode ierr; 222152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 223152b3e56SJunchao Zhang 224152b3e56SJunchao Zhang PetscFunctionBegin; 225eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 226076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 227076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 228076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 229076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 230076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 231076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 232076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 233076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 234076ba34aSJunchao Zhang #endif 23586a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 236076ba34aSJunchao Zhang } 237076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 238eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 239152b3e56SJunchao Zhang PetscFunctionReturn(0); 240152b3e56SJunchao Zhang } 241a587d139SMark 2428c3ff71bSJunchao Zhang /* y = A x */ 2438c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2448c3ff71bSJunchao Zhang { 2458c3ff71bSJunchao Zhang PetscErrorCode ierr; 2468c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 247152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 248152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2498c3ff71bSJunchao Zhang 2508c3ff71bSJunchao Zhang PetscFunctionBegin; 2518c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 252152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 253152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 2548c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 255eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 256152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 257152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 258152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 259076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 260152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2618c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2628c3ff71bSJunchao Zhang } 2638c3ff71bSJunchao Zhang 2648c3ff71bSJunchao Zhang /* y = A^T x */ 2658c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2668c3ff71bSJunchao Zhang { 2678c3ff71bSJunchao Zhang PetscErrorCode ierr; 2688c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 269152b3e56SJunchao Zhang const char *mode; 270152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 271152b3e56SJunchao Zhang PetscScalarKokkosView yv; 272076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2738c3ff71bSJunchao Zhang 2748c3ff71bSJunchao Zhang PetscFunctionBegin; 275eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2768c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 277152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 278152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 279152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 280076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 281152b3e56SJunchao Zhang mode = "N"; 282152b3e56SJunchao Zhang } else { 283076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 284076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 285152b3e56SJunchao Zhang mode = "T"; 286152b3e56SJunchao Zhang } 287076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 288152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 289152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 290076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 291eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2928c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2938c3ff71bSJunchao Zhang } 2948c3ff71bSJunchao Zhang 2958c3ff71bSJunchao Zhang /* y = A^H x */ 2968c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2978c3ff71bSJunchao Zhang { 2988c3ff71bSJunchao Zhang PetscErrorCode ierr; 2998c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 300152b3e56SJunchao Zhang const char *mode; 301152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 302152b3e56SJunchao Zhang PetscScalarKokkosView yv; 303076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3048c3ff71bSJunchao Zhang 3058c3ff71bSJunchao Zhang PetscFunctionBegin; 306eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3078c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 308152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 309152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 310152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 311076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 312152b3e56SJunchao Zhang mode = "N"; 313152b3e56SJunchao Zhang } else { 314076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 315076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 316152b3e56SJunchao Zhang mode = "C"; 317152b3e56SJunchao Zhang } 318076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 319152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 320152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 321076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 322eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3238c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3248c3ff71bSJunchao Zhang } 3258c3ff71bSJunchao Zhang 3268c3ff71bSJunchao Zhang /* z = A x + y */ 3278c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3288c3ff71bSJunchao Zhang { 3298c3ff71bSJunchao Zhang PetscErrorCode ierr; 3308c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 331152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 332152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3338c3ff71bSJunchao Zhang 3348c3ff71bSJunchao Zhang PetscFunctionBegin; 335eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3368c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 337152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 338152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 339152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3408c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3418c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 342152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 343152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 347eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3488c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3498c3ff71bSJunchao Zhang } 3508c3ff71bSJunchao Zhang 3518c3ff71bSJunchao Zhang /* z = A^T x + y */ 3528c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3538c3ff71bSJunchao Zhang { 3548c3ff71bSJunchao Zhang PetscErrorCode ierr; 3558c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 356152b3e56SJunchao Zhang const char *mode; 357152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 358152b3e56SJunchao Zhang PetscScalarKokkosView zv; 359076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3608c3ff71bSJunchao Zhang 3618c3ff71bSJunchao Zhang PetscFunctionBegin; 362eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3638c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 364152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 365152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 366152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3678c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 368152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 369076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 370152b3e56SJunchao Zhang mode = "N"; 371152b3e56SJunchao Zhang } else { 372076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 373076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 374152b3e56SJunchao Zhang mode = "T"; 375152b3e56SJunchao Zhang } 376076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 377152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 378152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 379152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 380076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 381eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3828c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3838c3ff71bSJunchao Zhang } 3848c3ff71bSJunchao Zhang 3858c3ff71bSJunchao Zhang /* z = A^H x + y */ 3868c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3878c3ff71bSJunchao Zhang { 3888c3ff71bSJunchao Zhang PetscErrorCode ierr; 3898c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 390152b3e56SJunchao Zhang const char *mode; 391152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 392152b3e56SJunchao Zhang PetscScalarKokkosView zv; 393076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3948c3ff71bSJunchao Zhang 3958c3ff71bSJunchao Zhang PetscFunctionBegin; 396eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3978c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 398152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 399152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 400152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 4018c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 402152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 403076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 404152b3e56SJunchao Zhang mode = "N"; 405152b3e56SJunchao Zhang } else { 406076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 407076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 408152b3e56SJunchao Zhang mode = "C"; 409152b3e56SJunchao Zhang } 410076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 411152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 412152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 413152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 414076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 415eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 416152b3e56SJunchao Zhang PetscFunctionReturn(0); 417152b3e56SJunchao Zhang } 418152b3e56SJunchao Zhang 419152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 420152b3e56SJunchao Zhang { 421152b3e56SJunchao Zhang PetscErrorCode ierr; 422152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 423152b3e56SJunchao Zhang 424152b3e56SJunchao Zhang PetscFunctionBegin; 425152b3e56SJunchao Zhang switch (op) { 426152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 427152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 428152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 429152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 430152b3e56SJunchao Zhang break; 431152b3e56SJunchao Zhang default: 432152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 433152b3e56SJunchao Zhang break; 434152b3e56SJunchao Zhang } 4358c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4368c3ff71bSJunchao Zhang } 4378c3ff71bSJunchao Zhang 438076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4393d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4408c3ff71bSJunchao Zhang { 4418c3ff71bSJunchao Zhang PetscErrorCode ierr; 442076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4438c3ff71bSJunchao Zhang 4448c3ff71bSJunchao Zhang PetscFunctionBegin; 445a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 446076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 447076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4488c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 449076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 450076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 451076ba34aSJunchao Zhang if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 452076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 453076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 454076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 455076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 456076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 457076ba34aSJunchao Zhang if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 458076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 459076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4608c3ff71bSJunchao Zhang } 461076ba34aSJunchao Zhang } 4628c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4638c3ff71bSJunchao Zhang } 4648c3ff71bSJunchao Zhang 465076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 466076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 467076ba34aSJunchao Zhang */ 468076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4698c3ff71bSJunchao Zhang { 4708c3ff71bSJunchao Zhang PetscErrorCode ierr; 471076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 472076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 473076ba34aSJunchao Zhang Mat mat; 4748c3ff71bSJunchao Zhang 4758c3ff71bSJunchao Zhang PetscFunctionBegin; 476076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 477076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 478076ba34aSJunchao Zhang mat = *B; 479076ba34aSJunchao Zhang if (A->assembled) { 480076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 481076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 482076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 483076ba34aSJunchao Zhang /* Now copy values to B if needed */ 484076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 485076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 486076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 487076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 488076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 489076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 490076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 491076ba34aSJunchao Zhang } 492076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 493076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 494076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 495076ba34aSJunchao Zhang } 496076ba34aSJunchao Zhang mat->spptr = bkok; 497076ba34aSJunchao Zhang } 498076ba34aSJunchao Zhang 499076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 500076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 501076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 502076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 5038c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5048c3ff71bSJunchao Zhang } 5058c3ff71bSJunchao Zhang 5060ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 5070ecb592aSJunchao Zhang { 5080ecb592aSJunchao Zhang PetscErrorCode ierr; 5090ecb592aSJunchao Zhang Mat At; 5100ecb592aSJunchao Zhang KokkosCsrMatrix *internT,*csrmatT; 5110ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 5120ecb592aSJunchao Zhang 5130ecb592aSJunchao Zhang PetscFunctionBegin; 5140ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 5150ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 5160ecb592aSJunchao Zhang CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 5170ecb592aSJunchao Zhang CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 5180ecb592aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 5190ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5207a3b1c56SStefano Zampini else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 5210ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5220ecb592aSJunchao Zhang if ((*B)->assembled) { 5230ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5240ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 5250ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 5260ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5270ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5280ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5290ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5300ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 5310ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5320ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5330ecb592aSJunchao Zhang } 5340ecb592aSJunchao Zhang PetscFunctionReturn(0); 5350ecb592aSJunchao Zhang } 5360ecb592aSJunchao Zhang 5378c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5388c3ff71bSJunchao Zhang { 5398c3ff71bSJunchao Zhang PetscErrorCode ierr; 54086a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5418c3ff71bSJunchao Zhang 5428c3ff71bSJunchao Zhang PetscFunctionBegin; 54386a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 54486a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5458f7e8f9dSMark Adams if (aijkok) { 5468f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 547a587d139SMark delete aijkok->colmap_d; 548a587d139SMark delete aijkok->i_uncompressed_d; 549a587d139SMark } 5508f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 5518f7e8f9dSMark Adams } 5528c3ff71bSJunchao Zhang delete aijkok; 55386a27549SJunchao Zhang } else { 55486a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 55586a27549SJunchao Zhang } 556152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 557152b3e56SJunchao Zhang A->spptr = NULL; 5588c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5598c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5608c3ff71bSJunchao Zhang } 5618c3ff71bSJunchao Zhang 56286a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 56386a27549SJunchao Zhang { 56486a27549SJunchao Zhang PetscErrorCode ierr; 56586a27549SJunchao Zhang 56686a27549SJunchao Zhang PetscFunctionBegin; 56786a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 56886a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 56986a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 57086a27549SJunchao Zhang PetscFunctionReturn(0); 57186a27549SJunchao Zhang } 57286a27549SJunchao Zhang 573076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 574076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 575a3f881fbSStefano Zampini { 576076ba34aSJunchao Zhang PetscErrorCode ierr; 577076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 578076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 579076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 580076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 581076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 582076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 583a3f881fbSStefano Zampini 584a3f881fbSStefano Zampini PetscFunctionBegin; 585076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 586076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 587076ba34aSJunchao Zhang PetscValidPointer(C,4); 588076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 589076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 590c0aa6a63SJacob Faibussowitsch if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 591076ba34aSJunchao Zhang if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 592076ba34aSJunchao Zhang 593076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 594076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 595076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 596076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 597076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 599076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 600076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 601076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 602076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 603076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 604076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 605076ba34aSJunchao Zhang nnz = a->nz + b->nz; 606076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 607076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 608076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 609076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 610076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 611076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 612076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 613076ba34aSJunchao Zhang ca = ca_dual.view_device(); 614076ba34aSJunchao Zhang ci = ci_dual.view_device(); 615076ba34aSJunchao Zhang cj = cj_dual.view_device(); 616076ba34aSJunchao Zhang 617076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 618076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 619076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 620076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 621076ba34aSJunchao Zhang 622076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 623076ba34aSJunchao Zhang ci(i) = coffset; 624076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 625076ba34aSJunchao Zhang }); 626076ba34aSJunchao Zhang 627076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 628076ba34aSJunchao Zhang if (k < alen) { 629076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 630076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 631076ba34aSJunchao Zhang } else { 632076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 633076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 634076ba34aSJunchao Zhang } 635076ba34aSJunchao Zhang }); 636076ba34aSJunchao Zhang }); 637076ba34aSJunchao Zhang ca_dual.modify_device(); 638076ba34aSJunchao Zhang ci_dual.modify_device(); 639076ba34aSJunchao Zhang cj_dual.modify_device(); 640076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 641076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 642076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 643076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 644076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 645076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 646076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 647076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 648076ba34aSJunchao Zhang 649076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 650076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 651076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 652076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 653076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 654076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 655076ba34aSJunchao Zhang }); 656076ba34aSJunchao Zhang }); 657076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 658076ba34aSJunchao Zhang } 659076ba34aSJunchao Zhang PetscFunctionReturn(0); 660076ba34aSJunchao Zhang } 661076ba34aSJunchao Zhang 662076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 663076ba34aSJunchao Zhang { 664076ba34aSJunchao Zhang PetscFunctionBegin; 665076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 666a3f881fbSStefano Zampini PetscFunctionReturn(0); 667a3f881fbSStefano Zampini } 668a3f881fbSStefano Zampini 669a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 670a3f881fbSStefano Zampini { 671076ba34aSJunchao Zhang PetscErrorCode ierr; 672a3f881fbSStefano Zampini Mat_Product *product = C->product; 673a3f881fbSStefano Zampini Mat A,B; 674076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 675a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 676a3f881fbSStefano Zampini Mat_SeqAIJ *c; 677076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 678076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 679a3f881fbSStefano Zampini 680a3f881fbSStefano Zampini PetscFunctionBegin; 681a3f881fbSStefano Zampini MatCheckProduct(C,1); 682a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 683076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 684076ba34aSJunchao Zhang 685076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 686076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 687076ba34aSJunchao Zhang PetscFunctionReturn(0); 688076ba34aSJunchao Zhang } 689076ba34aSJunchao Zhang 690076ba34aSJunchao Zhang switch (product->type) { 691076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 692076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 693076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 694076ba34aSJunchao Zhang default: 695076ba34aSJunchao Zhang SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 696076ba34aSJunchao Zhang } 697076ba34aSJunchao Zhang 698a3f881fbSStefano Zampini A = product->A; 699a3f881fbSStefano Zampini B = product->B; 700a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 701a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 702a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 703a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 704a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 705076ba34aSJunchao Zhang 706076ba34aSJunchao Zhang if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 707076ba34aSJunchao Zhang 708076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 709076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 710076ba34aSJunchao Zhang 711076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 712076ba34aSJunchao Zhang if (transA) { 713076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 714076ba34aSJunchao Zhang transA = false; 715a3f881fbSStefano Zampini } 716a3f881fbSStefano Zampini 717076ba34aSJunchao Zhang if (transB) { 718076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 719076ba34aSJunchao Zhang transB = false; 720076ba34aSJunchao Zhang } 721eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 722076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 723076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 724eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 725076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 726a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 727a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 728c0aa6a63SJacob Faibussowitsch ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 729a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 730c0aa6a63SJacob Faibussowitsch ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 731a3f881fbSStefano Zampini c->reallocs = 0; 732076ba34aSJunchao Zhang C->info.mallocs = 0; 733a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 734a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 735a3f881fbSStefano Zampini C->num_ass++; 736a3f881fbSStefano Zampini PetscFunctionReturn(0); 737a3f881fbSStefano Zampini } 738a3f881fbSStefano Zampini 739a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 740a3f881fbSStefano Zampini { 741a3f881fbSStefano Zampini PetscErrorCode ierr; 742076ba34aSJunchao Zhang Mat_Product *product = C->product; 743076ba34aSJunchao Zhang MatProductType ptype; 744076ba34aSJunchao Zhang Mat A,B; 745076ba34aSJunchao Zhang bool transA,transB; 746076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 747076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 748076ba34aSJunchao Zhang MPI_Comm comm; 749076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 750a3f881fbSStefano Zampini 751a3f881fbSStefano Zampini PetscFunctionBegin; 752a3f881fbSStefano Zampini MatCheckProduct(C,1); 753076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 754076ba34aSJunchao Zhang if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 755a3f881fbSStefano Zampini A = product->A; 756a3f881fbSStefano Zampini B = product->B; 757a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 758a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 759a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 760a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 761076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 762076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 763076ba34aSJunchao Zhang 764a3f881fbSStefano Zampini ptype = product->type; 765a3f881fbSStefano Zampini switch (ptype) { 766076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 767076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 768076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 769a3f881fbSStefano Zampini default: 770076ba34aSJunchao Zhang SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 771a3f881fbSStefano Zampini } 772a3f881fbSStefano Zampini 773076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 774076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 775076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 776076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 777a3f881fbSStefano Zampini 778076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 779076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7806ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7816ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7826ffb9508SJunchao Zhang */ 783076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 784076ba34aSJunchao Zhang 785eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 786076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 787076ba34aSJunchao Zhang if (transA) { 788076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 789076ba34aSJunchao Zhang transA = false; 790076ba34aSJunchao Zhang } 791076ba34aSJunchao Zhang 792076ba34aSJunchao Zhang if (transB) { 793076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 794076ba34aSJunchao Zhang transB = false; 795076ba34aSJunchao Zhang } 796076ba34aSJunchao Zhang 797076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 798076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 799076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 800076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 801076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 802076ba34aSJunchao Zhang */ 803076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 804076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 805eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 806076ba34aSJunchao Zhang 807076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 808076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 809076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 810a3f881fbSStefano Zampini PetscFunctionReturn(0); 811a3f881fbSStefano Zampini } 812a3f881fbSStefano Zampini 813a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 814076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 815a3f881fbSStefano Zampini { 816a3f881fbSStefano Zampini PetscErrorCode ierr; 817076ba34aSJunchao Zhang Mat_Product *product = mat->product; 818a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 819a3f881fbSStefano Zampini 820a3f881fbSStefano Zampini PetscFunctionBegin; 821a3f881fbSStefano Zampini MatCheckProduct(mat,1); 822a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 823a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 824a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 825a3f881fbSStefano Zampini } 826a3f881fbSStefano Zampini if (Biskok && Ciskok) { 827a3f881fbSStefano Zampini switch (product->type) { 828a3f881fbSStefano Zampini case MATPRODUCT_AB: 829a3f881fbSStefano Zampini case MATPRODUCT_AtB: 830a3f881fbSStefano Zampini case MATPRODUCT_ABt: 831a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 832a3f881fbSStefano Zampini break; 833a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 834a3f881fbSStefano Zampini case MATPRODUCT_RARt: 835a3f881fbSStefano Zampini case MATPRODUCT_ABC: 836a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 837a3f881fbSStefano Zampini break; 838a3f881fbSStefano Zampini default: 839a3f881fbSStefano Zampini break; 840a3f881fbSStefano Zampini } 841a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 842a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 843a3f881fbSStefano Zampini } 844a3f881fbSStefano Zampini PetscFunctionReturn(0); 845a3f881fbSStefano Zampini } 846a587d139SMark 847f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 848f0cf5187SStefano Zampini { 849f0cf5187SStefano Zampini PetscErrorCode ierr; 850f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 851f0cf5187SStefano Zampini 852f0cf5187SStefano Zampini PetscFunctionBegin; 853eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 854f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 855f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 856076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 857076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 858076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 859eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 860f0cf5187SStefano Zampini PetscFunctionReturn(0); 861f0cf5187SStefano Zampini } 862f0cf5187SStefano Zampini 863a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 864a587d139SMark { 865a587d139SMark PetscErrorCode ierr; 866076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 867a587d139SMark 868a587d139SMark PetscFunctionBegin; 869076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8702328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 871076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 872076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 8732328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8742328674fSJunchao Zhang ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 8752328674fSJunchao Zhang } 876a587d139SMark PetscFunctionReturn(0); 877a587d139SMark } 878a587d139SMark 879db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 880db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 881db78de30SJunchao Zhang { 882db78de30SJunchao Zhang PetscErrorCode ierr; 883db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 884db78de30SJunchao Zhang 885db78de30SJunchao Zhang PetscFunctionBegin; 886db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887db78de30SJunchao Zhang PetscValidPointer(kv,2); 888db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 889db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 890db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 891076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 892db78de30SJunchao Zhang PetscFunctionReturn(0); 893db78de30SJunchao Zhang } 894db78de30SJunchao Zhang 895db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 896db78de30SJunchao Zhang { 897db78de30SJunchao Zhang PetscFunctionBegin; 898db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899db78de30SJunchao Zhang PetscValidPointer(kv,2); 900db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901db78de30SJunchao Zhang PetscFunctionReturn(0); 902db78de30SJunchao Zhang } 903db78de30SJunchao Zhang 904db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 905db78de30SJunchao Zhang { 906db78de30SJunchao Zhang PetscErrorCode ierr; 907db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 908db78de30SJunchao Zhang 909db78de30SJunchao Zhang PetscFunctionBegin; 910db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 911db78de30SJunchao Zhang PetscValidPointer(kv,2); 912db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 913db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 914db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 915076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 916db78de30SJunchao Zhang PetscFunctionReturn(0); 917db78de30SJunchao Zhang } 918db78de30SJunchao Zhang 919db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 920db78de30SJunchao Zhang { 921db78de30SJunchao Zhang PetscErrorCode ierr; 922db78de30SJunchao Zhang 923db78de30SJunchao Zhang PetscFunctionBegin; 924db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 925db78de30SJunchao Zhang PetscValidPointer(kv,2); 926db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 927076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 928db78de30SJunchao Zhang PetscFunctionReturn(0); 929db78de30SJunchao Zhang } 930db78de30SJunchao Zhang 931db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 932db78de30SJunchao Zhang { 933db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 934db78de30SJunchao Zhang 935db78de30SJunchao Zhang PetscFunctionBegin; 936db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 937db78de30SJunchao Zhang PetscValidPointer(kv,2); 938db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 939db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 940076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 941db78de30SJunchao Zhang PetscFunctionReturn(0); 942db78de30SJunchao Zhang } 943db78de30SJunchao Zhang 944db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 945db78de30SJunchao Zhang { 946db78de30SJunchao Zhang PetscErrorCode ierr; 947db78de30SJunchao Zhang 948db78de30SJunchao Zhang PetscFunctionBegin; 949db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 950db78de30SJunchao Zhang PetscValidPointer(kv,2); 951db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 952076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 953db78de30SJunchao Zhang PetscFunctionReturn(0); 954db78de30SJunchao Zhang } 955db78de30SJunchao Zhang 956c17cf699SJunchao Zhang /* Computes Y += alpha X */ 957c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 958a587d139SMark { 959a587d139SMark PetscErrorCode ierr; 960a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 961c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 962c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 963c17cf699SJunchao Zhang MatScalarKokkosView Ya; 964a587d139SMark 965a587d139SMark PetscFunctionBegin; 966c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 967c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 968c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 969c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 970eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 971db78de30SJunchao Zhang 972c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 973c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 974a587d139SMark PetscBool e; 975a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 976a587d139SMark if (e) { 977a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 978c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 979a587d139SMark } 980a587d139SMark } 981db78de30SJunchao Zhang 982c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 983c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 984c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 985c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 986c17cf699SJunchao Zhang */ 987c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 988c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 989c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 990c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 991c17cf699SJunchao Zhang 992c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 993c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 994c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 995c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 996c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 997c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 998c17cf699SJunchao Zhang 999c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1000c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 1001c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1002c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 1003c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1004c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1005c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1006c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1007c17cf699SJunchao Zhang q++; 1008a587d139SMark } else { 1009c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1010c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1011c17cf699SJunchao Zhang */ 1012c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1013a587d139SMark } 1014c17cf699SJunchao Zhang } 1015c17cf699SJunchao Zhang }); 1016c17cf699SJunchao Zhang }); 1017c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 1018c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1019c17cf699SJunchao Zhang Mat Z; 1020c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1021c17cf699SJunchao Zhang KernelHandle kh; 1022c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1023c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1024c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1025c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 1026c17cf699SJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1027c17cf699SJunchao Zhang ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1028c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1029c17cf699SJunchao Zhang } 1030eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1031eeadb341SJunchao Zhang ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1032a587d139SMark PetscFunctionReturn(0); 1033a587d139SMark } 1034a587d139SMark 103586a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 10368f7e8f9dSMark Adams { 10378f7e8f9dSMark Adams PetscErrorCode ierr; 10388f7e8f9dSMark Adams 10398f7e8f9dSMark Adams PetscFunctionBegin; 10408f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 10418f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 10428f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 10438f7e8f9dSMark Adams PetscFunctionReturn(0); 10448f7e8f9dSMark Adams } 10458f7e8f9dSMark Adams 10468c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 10478c3ff71bSJunchao Zhang { 1048076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1049076ba34aSJunchao Zhang 10508c3ff71bSJunchao Zhang PetscFunctionBegin; 1051076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 10526f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 10536f3d89d0SStefano Zampini 10548c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 10558c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 10568c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1057a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1058f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1059a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1060076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 10618c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 10628c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 10638c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 10648c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 10658c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 10668c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1067076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 10680ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1069152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1070076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1071076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1072076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1073076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1074076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1075076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1076076ba34aSJunchao Zhang PetscFunctionReturn(0); 1077076ba34aSJunchao Zhang } 1078076ba34aSJunchao Zhang 1079076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1080076ba34aSJunchao Zhang { 1081076ba34aSJunchao Zhang PetscErrorCode ierr; 1082076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1083076ba34aSJunchao Zhang PetscInt i,m,n; 1084076ba34aSJunchao Zhang 1085076ba34aSJunchao Zhang PetscFunctionBegin; 1086076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1087076ba34aSJunchao Zhang 1088076ba34aSJunchao Zhang m = akok->nrows(); 1089076ba34aSJunchao Zhang n = akok->ncols(); 1090076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1091076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1092076ba34aSJunchao Zhang 1093076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1094076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1095076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1096076ba34aSJunchao Zhang 1097076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1098076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1099076ba34aSJunchao Zhang 1100076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1101076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1102076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1103076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1104076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1105076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1106076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1107076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1108076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1109076ba34aSJunchao Zhang 1110076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1111076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1112076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1113076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1114076ba34aSJunchao Zhang } 1115076ba34aSJunchao Zhang 1116076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1117076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1118076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1119076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1120076ba34aSJunchao Zhang A->spptr = akok; 1121076ba34aSJunchao Zhang PetscFunctionReturn(0); 1122076ba34aSJunchao Zhang } 1123076ba34aSJunchao Zhang 1124076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1125076ba34aSJunchao Zhang 1126076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1127076ba34aSJunchao Zhang */ 1128076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1129076ba34aSJunchao Zhang { 1130076ba34aSJunchao Zhang PetscErrorCode ierr; 1131076ba34aSJunchao Zhang 1132076ba34aSJunchao Zhang PetscFunctionBegin; 1133076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1134076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 11358c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11368c3ff71bSJunchao Zhang } 11378c3ff71bSJunchao Zhang 11388c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1139152b3e56SJunchao Zhang /*@C 11408c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 11418c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 11428c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 11438c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 11448c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 11458c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 11468c3ff71bSJunchao Zhang 11478c3ff71bSJunchao Zhang Collective 11488c3ff71bSJunchao Zhang 11498c3ff71bSJunchao Zhang Input Parameters: 11508c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 11518c3ff71bSJunchao Zhang . m - number of rows 11528c3ff71bSJunchao Zhang . n - number of columns 11538c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 11548c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 11558c3ff71bSJunchao Zhang (possibly different for each row) or NULL 11568c3ff71bSJunchao Zhang 11578c3ff71bSJunchao Zhang Output Parameter: 11588c3ff71bSJunchao Zhang . A - the matrix 11598c3ff71bSJunchao Zhang 11608c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 11618c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 11628c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 11638c3ff71bSJunchao Zhang 11648c3ff71bSJunchao Zhang Notes: 11658c3ff71bSJunchao Zhang If nnz is given then nz is ignored 11668c3ff71bSJunchao Zhang 11678c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 11688c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 11698c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 11708c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 11718c3ff71bSJunchao Zhang 11728c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 11738c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 11748c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 11758c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 11768c3ff71bSJunchao Zhang 11778c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 11788c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 11798c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 11808c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 11818c3ff71bSJunchao Zhang 11828c3ff71bSJunchao Zhang Level: intermediate 11838c3ff71bSJunchao Zhang 1184076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 11858c3ff71bSJunchao Zhang @*/ 11868c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 11878c3ff71bSJunchao Zhang { 11888c3ff71bSJunchao Zhang PetscErrorCode ierr; 11898c3ff71bSJunchao Zhang 11908c3ff71bSJunchao Zhang PetscFunctionBegin; 11918c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 11928c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 11938c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 11948c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 11958c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 11968c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11978c3ff71bSJunchao Zhang } 1198930e68a5SMark Adams 11998f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 12008f7e8f9dSMark Adams // 1201*46804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 12028f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 12038f7e8f9dSMark Adams // 12048f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1205930e68a5SMark Adams { 12068f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 12078f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 12088f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1209930e68a5SMark Adams PetscErrorCode ierr; 12108f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1211076ba34aSJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1212076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1213076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 12148f7e8f9dSMark Adams PetscBool row_identity,col_identity; 1215*46804e07SMark Adams PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1216930e68a5SMark Adams 1217930e68a5SMark Adams PetscFunctionBegin; 1218c0aa6a63SJacob Faibussowitsch if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 12198f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 12208f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 12218f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 12228f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 12238f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 12248f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 12258f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 12268f7e8f9dSMark Adams { 12278f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 12288f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 12298f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 12308f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 12318f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 12328f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 12338f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 12348f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 12358f7e8f9dSMark Adams size_t flops_h = 0.0; 12368f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 12378f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 12388f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 12398f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 12408f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 12418f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 12428f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 12438f7e8f9dSMark Adams // Fill A --> fact 12448f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1245042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 12468f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 12478f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 12488f7e8f9dSMark Adams // zero rows of B 12498f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12508f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 12518f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 12528f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 12538f7e8f9dSMark Adams /* zero (unfactored row) */ 12548f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 12558f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 12568f7e8f9dSMark Adams }); 12578f7e8f9dSMark Adams // copy A into B 12588f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12598f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 12608f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 12618f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 12628f7e8f9dSMark Adams /* load in initial (unfactored row) */ 12638f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 12648f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 12658f7e8f9dSMark Adams PetscScalar vala = av[j]; 12668f7e8f9dSMark Adams if (colb == rowb) { 12678f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 12688f7e8f9dSMark Adams } else { 12698f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12708f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12718f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 12728f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 12738f7e8f9dSMark Adams if (pbj[j] == colb) { 12748f7e8f9dSMark Adams pba[j] = vala; 12758f7e8f9dSMark Adams set++; 12768f7e8f9dSMark Adams break; 12778f7e8f9dSMark Adams } 12788f7e8f9dSMark Adams } 12798f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 12808f7e8f9dSMark Adams } 12818f7e8f9dSMark Adams } 12828f7e8f9dSMark Adams }); 12838f7e8f9dSMark Adams }); 12848f7e8f9dSMark Adams Kokkos::fence(); 1285930e68a5SMark Adams 12868f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 12878f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12888f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12898f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1290042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 12918f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 12928f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 12938f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 12948f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 12958f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 12968f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 12978f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 12988f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 12998f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 13008f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 13018f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 13028f7e8f9dSMark Adams else { 13038f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 13048f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 13058f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 13068f7e8f9dSMark Adams size_t st_idx; 13078f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 13088f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 13098f7e8f9dSMark Adams // get column, there has got to be a better way 13108f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 13118f7e8f9dSMark Adams if (pjL[j] == ii) { 13128f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13138f7e8f9dSMark Adams idx = j; // output 13148f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 13158f7e8f9dSMark Adams } 13168f7e8f9dSMark Adams }, st_idx); 13178f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 131899551766SMark Adams #if defined(PETSC_USE_DEBUG) 131999551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 132099551766SMark Adams #endif 132199551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 13228f7e8f9dSMark Adams // U(i+1,:end) 13238f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 13248f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 13258f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 13268f7e8f9dSMark Adams if (col==myk) { 13278f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 13288f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 13298f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 13308f7e8f9dSMark Adams } else { 13318f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 13328f7e8f9dSMark Adams PetscInt high, low; 13338f7e8f9dSMark Adams const PetscInt *startj; 13348f7e8f9dSMark Adams if (col<myk) { // L 13358f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 13368f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 13378f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 13388f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 13398f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 13408f7e8f9dSMark Adams } else { 13418f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 13428f7e8f9dSMark Adams start = ba_d + idx; 13438f7e8f9dSMark Adams startj= bj_d + idx; 13448f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 13458f7e8f9dSMark Adams } 13468f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 13478f7e8f9dSMark Adams low = 0; 13488f7e8f9dSMark Adams high = (PetscInt)(end-start); 13498f7e8f9dSMark Adams while (high-low > 5) { 13508f7e8f9dSMark Adams int t = (low+high)/2; 13518f7e8f9dSMark Adams if (startj[t] > col) high = t; 13528f7e8f9dSMark Adams else low = t; 13538f7e8f9dSMark Adams } 13548f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 13558f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 13568f7e8f9dSMark Adams } 135799551766SMark Adams #if defined(PETSC_USE_DEBUG) 135899551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 135999551766SMark Adams #endif 13608f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 13618f7e8f9dSMark Adams } 13628f7e8f9dSMark Adams }); 13638f7e8f9dSMark Adams } 13648f7e8f9dSMark Adams }); 13658f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 13668f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 13678f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 13688f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 13698f7e8f9dSMark Adams }); 13708f7e8f9dSMark Adams Kokkos::fence(); 13718f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 13728f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 13738f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 13748f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 13758f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 13768f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 13778f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 13788f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 13798f7e8f9dSMark Adams if (i < end) { 13808f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 13818f7e8f9dSMark Adams *pv = 1.0/(*pv); 13828f7e8f9dSMark Adams } 13838f7e8f9dSMark Adams }); 13848f7e8f9dSMark Adams }); 13858f7e8f9dSMark Adams } 13868f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 13878f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 13888f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 13898f7e8f9dSMark Adams 13908f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 13918f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 13928f7e8f9dSMark Adams if (b->inode.size) { 13938f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 13948f7e8f9dSMark Adams } else if (row_identity && col_identity) { 13958f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 13968f7e8f9dSMark Adams } else { 13978f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 13988f7e8f9dSMark Adams } 13998f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 14008f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 14018f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 14028f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 14038f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 14048f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 14058f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 14068f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 14078f7e8f9dSMark Adams 1408930e68a5SMark Adams PetscFunctionReturn(0); 1409930e68a5SMark Adams } 1410930e68a5SMark Adams 141186a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1412930e68a5SMark Adams { 1413930e68a5SMark Adams PetscErrorCode ierr; 1414930e68a5SMark Adams 1415930e68a5SMark Adams PetscFunctionBegin; 1416930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 141786a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 141886a27549SJunchao Zhang PetscFunctionReturn(0); 141986a27549SJunchao Zhang } 142086a27549SJunchao Zhang 142186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 142286a27549SJunchao Zhang { 142386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 142486a27549SJunchao Zhang 142586a27549SJunchao Zhang PetscFunctionBegin; 142686a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 142786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 142886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 142986a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 143086a27549SJunchao Zhang } 143186a27549SJunchao Zhang PetscFunctionReturn(0); 143286a27549SJunchao Zhang } 143386a27549SJunchao Zhang 143486a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 143586a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 143686a27549SJunchao Zhang { 143786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1438076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 143986a27549SJunchao Zhang 144086a27549SJunchao Zhang PetscFunctionBegin; 144186a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 144286a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1443076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 144486a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1445076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1446076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 144786a27549SJunchao Zhang 144886a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1449076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1450076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1451076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 145286a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 145386a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 145486a27549SJunchao Zhang 145586a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 145686a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 145786a27549SJunchao Zhang */ 1458076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1459076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 146086a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 146186a27549SJunchao Zhang 146286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 146386a27549SJunchao Zhang 146486a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1465076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 146686a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1467076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1468076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 146986a27549SJunchao Zhang 147086a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1471076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1472076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1473076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 147486a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 147586a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 147686a27549SJunchao Zhang 147786a27549SJunchao Zhang /* Sort indices. See comments above */ 1478076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1479076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 148086a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 148186a27549SJunchao Zhang 148286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 148386a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 148486a27549SJunchao Zhang } 148586a27549SJunchao Zhang PetscFunctionReturn(0); 148686a27549SJunchao Zhang } 148786a27549SJunchao Zhang 148886a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 148986a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 149086a27549SJunchao Zhang { 149186a27549SJunchao Zhang PetscErrorCode ierr; 149286a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 149386a27549SJunchao Zhang PetscScalarKokkosView xv; 149486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 149586a27549SJunchao Zhang 149686a27549SJunchao Zhang PetscFunctionBegin; 1497eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 149886a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 149986a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 150086a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 150186a27549SJunchao Zhang /* Solve L tmpv = b */ 15023f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 150386a27549SJunchao Zhang /* Solve Ux = tmpv */ 15043f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 150586a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 150686a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1507eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 150886a27549SJunchao Zhang PetscFunctionReturn(0); 150986a27549SJunchao Zhang } 151086a27549SJunchao Zhang 1511076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 151286a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 151386a27549SJunchao Zhang { 151486a27549SJunchao Zhang PetscErrorCode ierr; 151586a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 151686a27549SJunchao Zhang PetscScalarKokkosView xv; 151786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 151886a27549SJunchao Zhang 151986a27549SJunchao Zhang PetscFunctionBegin; 1520eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 152186a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 152286a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 152386a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 152486a27549SJunchao Zhang /* Solve U^T tmpv = b */ 152586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 152686a27549SJunchao Zhang 152786a27549SJunchao Zhang /* Solve L^T x = tmpv */ 152886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 152986a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 153086a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1531eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 153286a27549SJunchao Zhang PetscFunctionReturn(0); 153386a27549SJunchao Zhang } 153486a27549SJunchao Zhang 153586a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 153686a27549SJunchao Zhang { 153786a27549SJunchao Zhang PetscErrorCode ierr; 153886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 153986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 154086a27549SJunchao Zhang PetscInt fill_lev = info->levels; 154186a27549SJunchao Zhang 154286a27549SJunchao Zhang PetscFunctionBegin; 1543eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 154486a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1545076ba34aSJunchao Zhang 1546076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1547076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1548076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1549076ba34aSJunchao Zhang 1550076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 155186a27549SJunchao Zhang 155286a27549SJunchao Zhang B->assembled = PETSC_TRUE; 155386a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 155486a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 155586a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 155686a27549SJunchao Zhang B->ops->matsolve = NULL; 155786a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 155886a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 155986a27549SJunchao Zhang 156086a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 156186a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 156286a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1563eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 1564eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 156586a27549SJunchao Zhang PetscFunctionReturn(0); 156686a27549SJunchao Zhang } 156786a27549SJunchao Zhang 156886a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 156986a27549SJunchao Zhang { 157086a27549SJunchao Zhang PetscErrorCode ierr; 157186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 157286a27549SJunchao Zhang Mat_SeqAIJ *b; 157386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 157486a27549SJunchao Zhang PetscInt fill_lev = info->levels; 157586a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 157686a27549SJunchao Zhang PetscInt n = A->rmap->n; 157786a27549SJunchao Zhang 157886a27549SJunchao Zhang PetscFunctionBegin; 157986a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 158086a27549SJunchao Zhang /* Rebuild factors */ 158186a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 158286a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 158386a27549SJunchao Zhang 158486a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 158586a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 158686a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 158786a27549SJunchao Zhang 158886a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 158986a27549SJunchao Zhang 159086a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 159186a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 159286a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 159386a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 159486a27549SJunchao Zhang 159586a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1596076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1597076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1598076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 159986a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 160086a27549SJunchao Zhang 160186a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 160286a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 160386a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 160486a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 160586a27549SJunchao Zhang 160686a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 160786a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 160886a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 160986a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 161086a27549SJunchao Zhang #else 161186a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 161286a27549SJunchao Zhang #endif 161386a27549SJunchao Zhang 161486a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 161586a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 161686a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 161786a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 161886a27549SJunchao Zhang 161986a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 162086a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 162186a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 162286a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 162386a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 162486a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 162586a27549SJunchao Zhang 162686a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 162786a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1628930e68a5SMark Adams PetscFunctionReturn(0); 1629930e68a5SMark Adams } 1630930e68a5SMark Adams 16318f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 16328f7e8f9dSMark Adams { 16338f7e8f9dSMark Adams PetscErrorCode ierr; 16348f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 16358f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1636930e68a5SMark Adams 16378f7e8f9dSMark Adams PetscFunctionBegin; 16388f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 16398f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 16408f7e8f9dSMark Adams // move B data into Kokkos 16418f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 16428f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 16438f7e8f9dSMark Adams { 16448f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 16458f7e8f9dSMark Adams if (!baijkok->diag_d) { 16468f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1647152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 16488f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 16498f7e8f9dSMark Adams } 16508f7e8f9dSMark Adams } 16518f7e8f9dSMark Adams PetscFunctionReturn(0); 16528f7e8f9dSMark Adams } 16538f7e8f9dSMark Adams 165486a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1655930e68a5SMark Adams { 1656930e68a5SMark Adams PetscFunctionBegin; 1657930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1658930e68a5SMark Adams PetscFunctionReturn(0); 1659930e68a5SMark Adams } 1660930e68a5SMark Adams 16618f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 16628f7e8f9dSMark Adams { 16638f7e8f9dSMark Adams PetscFunctionBegin; 16648f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 16658f7e8f9dSMark Adams PetscFunctionReturn(0); 16668f7e8f9dSMark Adams } 16678f7e8f9dSMark Adams 1668930e68a5SMark Adams /*MC 166986a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 167086a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1671930e68a5SMark Adams 1672930e68a5SMark Adams Level: beginner 1673930e68a5SMark Adams 167486a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1675930e68a5SMark Adams M*/ 167686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1677930e68a5SMark Adams { 1678930e68a5SMark Adams PetscErrorCode ierr; 1679930e68a5SMark Adams PetscInt n = A->rmap->n; 1680930e68a5SMark Adams 1681930e68a5SMark Adams PetscFunctionBegin; 1682930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1683930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1684930e68a5SMark Adams (*B)->factortype = ftype; 16854ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1686930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1687930e68a5SMark Adams 16888f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1689930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 169086a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 169186a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 169286a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 169386a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 169486a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 169586a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 169686a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1697930e68a5SMark Adams 1698930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 169986a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1700930e68a5SMark Adams PetscFunctionReturn(0); 1701930e68a5SMark Adams } 17028f7e8f9dSMark Adams 17038f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 17048f7e8f9dSMark Adams { 17058f7e8f9dSMark Adams PetscErrorCode ierr; 17068f7e8f9dSMark Adams PetscInt n = A->rmap->n; 17078f7e8f9dSMark Adams 17088f7e8f9dSMark Adams PetscFunctionBegin; 17098f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 17108f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 17118f7e8f9dSMark Adams (*B)->factortype = ftype; 1712f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 17134ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 17148f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 17158f7e8f9dSMark Adams 17168f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17178f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 17188f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17198f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 17208f7e8f9dSMark Adams 17218f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 17228f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 17238f7e8f9dSMark Adams PetscFunctionReturn(0); 17248f7e8f9dSMark Adams } 172586a27549SJunchao Zhang 172686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 172786a27549SJunchao Zhang { 172886a27549SJunchao Zhang PetscErrorCode ierr; 172986a27549SJunchao Zhang 173086a27549SJunchao Zhang PetscFunctionBegin; 173186a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 173286a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 173386a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 173486a27549SJunchao Zhang PetscFunctionReturn(0); 173586a27549SJunchao Zhang } 173686a27549SJunchao Zhang 1737076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1738076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1739076ba34aSJunchao Zhang { 1740076ba34aSJunchao Zhang PetscErrorCode ierr; 1741076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1742076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1743076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1744076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1745076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1746076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1747076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1748076ba34aSJunchao Zhang 1749076ba34aSJunchao Zhang PetscFunctionBegin; 1750c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1751076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1752c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1753076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1754c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1755076ba34aSJunchao Zhang } 1756076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1757076ba34aSJunchao Zhang } 1758076ba34aSJunchao Zhang PetscFunctionReturn(0); 1759076ba34aSJunchao Zhang } 1760