111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 38c3ff71bSJunchao Zhang #include <petscsystypes.h> 48c3ff71bSJunchao Zhang #include <petscerror.h> 58c3ff71bSJunchao Zhang 68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 7f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 10*86a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 11*86a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 12*86a27549SJunchao Zhang 138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 148c3ff71bSJunchao Zhang 158c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 16a587d139SMark #include <petscmat.h> 178c3ff71bSJunchao Zhang 188c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 218c3ff71bSJunchao Zhang { 228c3ff71bSJunchao Zhang PetscErrorCode ierr; 23a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 248c3ff71bSJunchao Zhang 258c3ff71bSJunchao Zhang PetscFunctionBegin; 268c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 27*86a27549SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 28a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 29a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 30a587d139SMark } 318c3ff71bSJunchao Zhang /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 328c3ff71bSJunchao Zhang PetscFunctionReturn(0); 338c3ff71bSJunchao Zhang } 348c3ff71bSJunchao Zhang 35*86a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 368c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 378c3ff71bSJunchao Zhang { 38152b3e56SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ*>(A->data); 398c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408c3ff71bSJunchao Zhang PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 418c3ff71bSJunchao Zhang 428c3ff71bSJunchao Zhang PetscFunctionBegin; 43*86a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 44152b3e56SJunchao Zhang /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */ 458c3ff71bSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 468c3ff71bSJunchao Zhang delete aijkok; 478c3ff71bSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 488c3ff71bSJunchao Zhang aijkok->nonzerostate = A->nonzerostate; 498c3ff71bSJunchao Zhang A->spptr = aijkok; 508c3ff71bSJunchao Zhang } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 51*86a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 52152b3e56SJunchao Zhang aijkok->a_dual.modify_host(); /* Mark host is modified */ 53152b3e56SJunchao Zhang aijkok->a_dual.sync_device(); /* Sync the device */ 54*86a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 55*86a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 568c3ff71bSJunchao Zhang } 578c3ff71bSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 588c3ff71bSJunchao Zhang PetscFunctionReturn(0); 598c3ff71bSJunchao Zhang } 608c3ff71bSJunchao Zhang 61*86a27549SJunchao Zhang /* Mark the CSR data on device is modified */ 62*86a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A) 63*86a27549SJunchao Zhang { 64*86a27549SJunchao Zhang PetscErrorCode ierr; 65*86a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 66*86a27549SJunchao Zhang 67*86a27549SJunchao Zhang PetscFunctionBegin; 68*86a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 69*86a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 70*86a27549SJunchao Zhang aijkok->a_dual.modify_device(); 71*86a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 72*86a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 73*86a27549SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_GPU; 74*86a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 75*86a27549SJunchao Zhang PetscFunctionReturn(0); 76*86a27549SJunchao Zhang } 77*86a27549SJunchao Zhang 78f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 79f0cf5187SStefano Zampini { 80f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 81f0cf5187SStefano Zampini 82f0cf5187SStefano Zampini PetscFunctionBegin; 83f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 84*86a27549SJunchao Zhang /* We do not expect one needs factors on host */ 85*86a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 86f0cf5187SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 87f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 88*86a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 89*86a27549SJunchao Zhang aijkok->a_dual.modify_device(); /* Mark device is modified */ 90*86a27549SJunchao Zhang aijkok->a_dual.sync_host(); /* Sync the host */ 91f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 92f0cf5187SStefano Zampini } 93f0cf5187SStefano Zampini PetscFunctionReturn(0); 94f0cf5187SStefano Zampini } 95f0cf5187SStefano Zampini 96f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 97f0cf5187SStefano Zampini { 98f0cf5187SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 99f0cf5187SStefano Zampini PetscErrorCode ierr; 100f0cf5187SStefano Zampini 101f0cf5187SStefano Zampini PetscFunctionBegin; 102f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 103f0cf5187SStefano Zampini *array = a->a; 104f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 105f0cf5187SStefano Zampini PetscFunctionReturn(0); 106f0cf5187SStefano Zampini } 107f0cf5187SStefano Zampini 108a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 109a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 110a587d139SMark { 111a587d139SMark Mat_SeqAIJKokkos *aijkok; 112a587d139SMark Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 113a587d139SMark 114a587d139SMark PetscFunctionBegin; 115a587d139SMark // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 116a587d139SMark aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 117a587d139SMark if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 118152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 119a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 120a587d139SMark PetscFunctionReturn(0); 121a587d139SMark } 122a587d139SMark 123a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 124a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 125a587d139SMark { 126a587d139SMark Mat_SeqAIJKokkos *aijkok; 127a587d139SMark 128a587d139SMark PetscFunctionBegin; 129a587d139SMark aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 131a587d139SMark *d_mat = aijkok->device_mat_d.data(); 132a587d139SMark } else { 133a587d139SMark PetscErrorCode ierr; 134a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 135a587d139SMark *d_mat = NULL; 136a587d139SMark } 137a587d139SMark PetscFunctionReturn(0); 138a587d139SMark } 139152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A) 140152b3e56SJunchao Zhang { 141152b3e56SJunchao Zhang PetscErrorCode ierr; 142152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 143152b3e56SJunchao Zhang 144152b3e56SJunchao Zhang PetscFunctionBegin; 145152b3e56SJunchao Zhang if (!aijkok->At) { /* Generate At for the first time */ 146152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr); 147152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 148*86a27549SJunchao Zhang } else if (!aijkok->transpose_updated) { /* Only update At values */ 149152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr); 150152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 151152b3e56SJunchao Zhang } 152*86a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 153152b3e56SJunchao Zhang PetscFunctionReturn(0); 154152b3e56SJunchao Zhang } 155152b3e56SJunchao Zhang 156*86a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A) 157152b3e56SJunchao Zhang { 158152b3e56SJunchao Zhang PetscErrorCode ierr; 159152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 160152b3e56SJunchao Zhang 161152b3e56SJunchao Zhang PetscFunctionBegin; 162152b3e56SJunchao Zhang if (!aijkok->Ah) { /* Generate Ah for the first time */ 163152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 164152b3e56SJunchao Zhang ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 165152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 166*86a27549SJunchao Zhang } else if (!aijkok->hermitian_updated) { /* Only update Ah values */ 167152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 168152b3e56SJunchao Zhang ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 169152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 170152b3e56SJunchao Zhang } 171*86a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 172152b3e56SJunchao Zhang PetscFunctionReturn(0); 173152b3e56SJunchao Zhang } 174a587d139SMark 1758c3ff71bSJunchao Zhang /* y = A x */ 1768c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1778c3ff71bSJunchao Zhang { 1788c3ff71bSJunchao Zhang PetscErrorCode ierr; 1798c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 180152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 181152b3e56SJunchao Zhang PetscScalarKokkosView yv; 1828c3ff71bSJunchao Zhang 1838c3ff71bSJunchao Zhang PetscFunctionBegin; 1848c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 185152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 186152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 1878c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 189152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 190152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 191bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 192152b3e56SJunchao Zhang /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 193152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 1948c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1958c3ff71bSJunchao Zhang } 1968c3ff71bSJunchao Zhang 1978c3ff71bSJunchao Zhang /* y = A^T x */ 1988c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1998c3ff71bSJunchao Zhang { 2008c3ff71bSJunchao Zhang PetscErrorCode ierr; 2018c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 202152b3e56SJunchao Zhang Mat B; 203152b3e56SJunchao Zhang const char *mode; 204152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 205152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2068c3ff71bSJunchao Zhang 2078c3ff71bSJunchao Zhang PetscFunctionBegin; 2088c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 209152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 210152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 211152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 212152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 213152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 214152b3e56SJunchao Zhang mode = "N"; 215152b3e56SJunchao Zhang } else { 216152b3e56SJunchao Zhang B = A; 217152b3e56SJunchao Zhang mode = "T"; 218152b3e56SJunchao Zhang } 219152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 220152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 221152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 222152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 223bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 224152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2258c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2268c3ff71bSJunchao Zhang } 2278c3ff71bSJunchao Zhang 2288c3ff71bSJunchao Zhang /* y = A^H x */ 2298c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2308c3ff71bSJunchao Zhang { 2318c3ff71bSJunchao Zhang PetscErrorCode ierr; 2328c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 233152b3e56SJunchao Zhang Mat B; 234152b3e56SJunchao Zhang const char *mode; 235152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 236152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2378c3ff71bSJunchao Zhang 2388c3ff71bSJunchao Zhang PetscFunctionBegin; 2398c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 240152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 241152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 242152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 243*86a27549SJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 244152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 245152b3e56SJunchao Zhang mode = "N"; 246152b3e56SJunchao Zhang } else { 247152b3e56SJunchao Zhang B = A; 248152b3e56SJunchao Zhang mode = "C"; 249152b3e56SJunchao Zhang } 250152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 251152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 252152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 253152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 254bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 255152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2568c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2578c3ff71bSJunchao Zhang } 2588c3ff71bSJunchao Zhang 2598c3ff71bSJunchao Zhang /* z = A x + y */ 2608c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 2618c3ff71bSJunchao Zhang { 2628c3ff71bSJunchao Zhang PetscErrorCode ierr; 2638c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 264152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 265152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2668c3ff71bSJunchao Zhang 2678c3ff71bSJunchao Zhang PetscFunctionBegin; 2688c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 269152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 270152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 271152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 2728c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 2738c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 274152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 275152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 276152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 277152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 278bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 279152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2808c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2818c3ff71bSJunchao Zhang } 2828c3ff71bSJunchao Zhang 2838c3ff71bSJunchao Zhang /* z = A^T x + y */ 2848c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 2858c3ff71bSJunchao Zhang { 2868c3ff71bSJunchao Zhang PetscErrorCode ierr; 2878c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 288152b3e56SJunchao Zhang Mat B; 289152b3e56SJunchao Zhang const char *mode; 290152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 291152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2928c3ff71bSJunchao Zhang 2938c3ff71bSJunchao Zhang PetscFunctionBegin; 2948c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 295152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 296152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 297152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 2988c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 299152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 300152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 301152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 302152b3e56SJunchao Zhang mode = "N"; 303152b3e56SJunchao Zhang } else { 304152b3e56SJunchao Zhang B = A; 305152b3e56SJunchao Zhang mode = "T"; 306152b3e56SJunchao Zhang } 307152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 308152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 309152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 310152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 311152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 312bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 313152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 3148c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3158c3ff71bSJunchao Zhang } 3168c3ff71bSJunchao Zhang 3178c3ff71bSJunchao Zhang /* z = A^H x + y */ 3188c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3198c3ff71bSJunchao Zhang { 3208c3ff71bSJunchao Zhang PetscErrorCode ierr; 3218c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 322152b3e56SJunchao Zhang Mat B; 323152b3e56SJunchao Zhang const char *mode; 324152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 325152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3268c3ff71bSJunchao Zhang 3278c3ff71bSJunchao Zhang PetscFunctionBegin; 3288c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 329152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 330152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 331152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3328c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 333152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 334*86a27549SJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 335152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 336152b3e56SJunchao Zhang mode = "N"; 337152b3e56SJunchao Zhang } else { 338152b3e56SJunchao Zhang B = A; 339152b3e56SJunchao Zhang mode = "C"; 340152b3e56SJunchao Zhang } 341152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 342152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 343152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 347152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348152b3e56SJunchao Zhang PetscFunctionReturn(0); 349152b3e56SJunchao Zhang } 350152b3e56SJunchao Zhang 351152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 352152b3e56SJunchao Zhang { 353152b3e56SJunchao Zhang PetscErrorCode ierr; 354152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355152b3e56SJunchao Zhang 356152b3e56SJunchao Zhang PetscFunctionBegin; 357152b3e56SJunchao Zhang switch (op) { 358152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 359152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 360152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 361152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 362152b3e56SJunchao Zhang break; 363152b3e56SJunchao Zhang default: 364152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 365152b3e56SJunchao Zhang break; 366152b3e56SJunchao Zhang } 3678c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3688c3ff71bSJunchao Zhang } 3698c3ff71bSJunchao Zhang 3703d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3718c3ff71bSJunchao Zhang { 3728c3ff71bSJunchao Zhang PetscErrorCode ierr; 3738c3ff71bSJunchao Zhang Mat B; 374c58ef05eSStefano Zampini Mat_SeqAIJ *aij; 3758c3ff71bSJunchao Zhang 3768c3ff71bSJunchao Zhang PetscFunctionBegin; 377a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 3788c3ff71bSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 3798c3ff71bSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3808c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 3818c3ff71bSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3828c3ff71bSJunchao Zhang } 3838c3ff71bSJunchao Zhang 3848c3ff71bSJunchao Zhang B = *newmat; 3858c3ff71bSJunchao Zhang ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 386*86a27549SJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 387*86a27549SJunchao Zhang 3888c3ff71bSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 3890d8bce8aSStefano Zampini ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 390f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 391c58ef05eSStefano Zampini /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 392c58ef05eSStefano Zampini aij = (Mat_SeqAIJ*)B->data; 393c58ef05eSStefano Zampini aij->inode.use = PETSC_FALSE; 3948c3ff71bSJunchao Zhang 3958c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3968c3ff71bSJunchao Zhang } 3978c3ff71bSJunchao Zhang 3988c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 3998c3ff71bSJunchao Zhang { 4008c3ff71bSJunchao Zhang PetscErrorCode ierr; 4018c3ff71bSJunchao Zhang 4028c3ff71bSJunchao Zhang PetscFunctionBegin; 4038c3ff71bSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 4048c3ff71bSJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 4058c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4068c3ff71bSJunchao Zhang } 4078c3ff71bSJunchao Zhang 4088c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 4098c3ff71bSJunchao Zhang { 4108c3ff71bSJunchao Zhang PetscErrorCode ierr; 411*86a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 4128c3ff71bSJunchao Zhang 4138c3ff71bSJunchao Zhang PetscFunctionBegin; 414*86a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 415*86a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 4168f7e8f9dSMark Adams if (aijkok) { 4178f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 418a587d139SMark delete aijkok->colmap_d; 419a587d139SMark delete aijkok->i_uncompressed_d; 420a587d139SMark } 4218f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 4228f7e8f9dSMark Adams } 4238c3ff71bSJunchao Zhang delete aijkok; 424*86a27549SJunchao Zhang } else { 425*86a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 426*86a27549SJunchao Zhang } 427f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 428152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 429152b3e56SJunchao Zhang A->spptr = NULL; 4308c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 4318c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4328c3ff71bSJunchao Zhang } 4338c3ff71bSJunchao Zhang 434*86a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 435*86a27549SJunchao Zhang { 436*86a27549SJunchao Zhang PetscErrorCode ierr; 437*86a27549SJunchao Zhang 438*86a27549SJunchao Zhang PetscFunctionBegin; 439*86a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 440*86a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 441*86a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 442*86a27549SJunchao Zhang PetscFunctionReturn(0); 443*86a27549SJunchao Zhang } 444*86a27549SJunchao Zhang 445a3f881fbSStefano Zampini #if 0 446a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 447a3f881fbSStefano Zampini { 448a3f881fbSStefano Zampini MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 449a3f881fbSStefano Zampini 450a3f881fbSStefano Zampini PetscFunctionBegin; 451a3f881fbSStefano Zampini delete kh; 452a3f881fbSStefano Zampini PetscFunctionReturn(0); 453a3f881fbSStefano Zampini } 454a3f881fbSStefano Zampini 455a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 456a3f881fbSStefano Zampini { 457a3f881fbSStefano Zampini Mat_Product *product = C->product; 458a3f881fbSStefano Zampini Mat A,B; 459a3f881fbSStefano Zampini MatProductType ptype; 460a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 461a3f881fbSStefano Zampini bool tA,tB; 462a3f881fbSStefano Zampini PetscErrorCode ierr; 463a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 464a3f881fbSStefano Zampini Mat_SeqAIJ *c; 465a3f881fbSStefano Zampini 466a3f881fbSStefano Zampini PetscFunctionBegin; 467a3f881fbSStefano Zampini MatCheckProduct(C,1); 468a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 469a3f881fbSStefano Zampini A = product->A; 470a3f881fbSStefano Zampini B = product->B; 471a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 472a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 473a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 474a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 475a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 476a3f881fbSStefano Zampini kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 477a3f881fbSStefano Zampini ptype = product->type; 478a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 479a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 480a3f881fbSStefano Zampini switch (ptype) { 481a3f881fbSStefano Zampini case MATPRODUCT_AB: 482a3f881fbSStefano Zampini tA = false; 483a3f881fbSStefano Zampini tB = false; 484a3f881fbSStefano Zampini break; 485a3f881fbSStefano Zampini case MATPRODUCT_AtB: 486a3f881fbSStefano Zampini tA = true; 487a3f881fbSStefano Zampini tB = false; 488a3f881fbSStefano Zampini break; 489a3f881fbSStefano Zampini case MATPRODUCT_ABt: 490a3f881fbSStefano Zampini tA = false; 491a3f881fbSStefano Zampini tB = true; 492a3f881fbSStefano Zampini break; 493a3f881fbSStefano Zampini default: 494a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 495a3f881fbSStefano Zampini } 496a3f881fbSStefano Zampini 497a3f881fbSStefano Zampini KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 498a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 499a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 500a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 501a3f881fbSStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 502a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 503a3f881fbSStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 504a3f881fbSStefano Zampini c->reallocs = 0; 505a3f881fbSStefano Zampini C->info.mallocs += 0; 506a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 507a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 508a3f881fbSStefano Zampini C->num_ass++; 509a3f881fbSStefano Zampini /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 510a3f881fbSStefano Zampini // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 511a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_BOTH; 512a3f881fbSStefano Zampini // Also, we should add support to copy back from device to host 513a3f881fbSStefano Zampini PetscFunctionReturn(0); 514a3f881fbSStefano Zampini } 515a3f881fbSStefano Zampini 516a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 517a3f881fbSStefano Zampini { 518a3f881fbSStefano Zampini Mat_Product *product = C->product; 519a3f881fbSStefano Zampini Mat A,B; 520a3f881fbSStefano Zampini MatProductType ptype; 521a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 522a3f881fbSStefano Zampini PetscInt m,n,k; 523a3f881fbSStefano Zampini bool tA,tB; 524a3f881fbSStefano Zampini PetscErrorCode ierr; 525a3f881fbSStefano Zampini Mat_SeqAIJ *c; 526a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 527a3f881fbSStefano Zampini 528a3f881fbSStefano Zampini PetscFunctionBegin; 529a3f881fbSStefano Zampini MatCheckProduct(C,1); 530a3f881fbSStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 531a3f881fbSStefano Zampini A = product->A; 532a3f881fbSStefano Zampini B = product->B; 533a3f881fbSStefano Zampini // TODO only copy the i,j data, not the values 534a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 535a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 536a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 537a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 538a3f881fbSStefano Zampini ptype = product->type; 539a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 540a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 541a3f881fbSStefano Zampini switch (ptype) { 542a3f881fbSStefano Zampini case MATPRODUCT_AB: 543a3f881fbSStefano Zampini tA = false; 544a3f881fbSStefano Zampini tB = false; 545a3f881fbSStefano Zampini m = A->rmap->n; 546a3f881fbSStefano Zampini n = B->cmap->n; 547a3f881fbSStefano Zampini break; 548a3f881fbSStefano Zampini case MATPRODUCT_AtB: 549a3f881fbSStefano Zampini tA = true; 550a3f881fbSStefano Zampini tB = false; 551a3f881fbSStefano Zampini m = A->cmap->n; 552a3f881fbSStefano Zampini n = B->cmap->n; 553a3f881fbSStefano Zampini break; 554a3f881fbSStefano Zampini case MATPRODUCT_ABt: 555a3f881fbSStefano Zampini tA = false; 556a3f881fbSStefano Zampini tB = true; 557a3f881fbSStefano Zampini m = A->rmap->n; 558a3f881fbSStefano Zampini n = B->rmap->n; 559a3f881fbSStefano Zampini break; 560a3f881fbSStefano Zampini default: 561a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 562a3f881fbSStefano Zampini } 563a3f881fbSStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 564a3f881fbSStefano Zampini ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 565a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 566a3f881fbSStefano Zampini 567a3f881fbSStefano Zampini kh = new MatMatKernelHandle_t; 568a3f881fbSStefano Zampini // TODO SZ: ADD RUNTIME SELECTION OF THESE 569a3f881fbSStefano Zampini kh->set_team_work_size(16); 570a3f881fbSStefano Zampini kh->set_dynamic_scheduling(true); 571a3f881fbSStefano Zampini // Select an spgemm algorithm, limited by configuration at compile-time and 572a3f881fbSStefano Zampini // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 573a3f881fbSStefano Zampini // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 574a3f881fbSStefano Zampini std::string myalg("SPGEMM_KK_MEMORY"); 575a3f881fbSStefano Zampini kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 576a3f881fbSStefano Zampini 577a3f881fbSStefano Zampini ///////////////////////////////////// 578a3f881fbSStefano Zampini // TODO JZ 579a3f881fbSStefano Zampini ckok = NULL; //new Mat_SeqAIJKokkos(); 580a3f881fbSStefano Zampini C->spptr = ckok; 581a3f881fbSStefano Zampini KokkosCsrMatrix_t ccsr; // here only to have the code compile 582a3f881fbSStefano Zampini KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 583a3f881fbSStefano Zampini //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 584a3f881fbSStefano Zampini //c->nz = get_nnz_from_ccsr 585a3f881fbSStefano Zampini ////////////////////////////////////// 586a3f881fbSStefano Zampini c->singlemalloc = PETSC_FALSE; 587a3f881fbSStefano Zampini c->free_a = PETSC_TRUE; 588a3f881fbSStefano Zampini c->free_ij = PETSC_TRUE; 589a3f881fbSStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 590a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 591a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 592a3f881fbSStefano Zampini //////////////////////////////////// 593a3f881fbSStefano Zampini // TODO JZ copy from device to c->i and c->j 594a3f881fbSStefano Zampini //////////////////////////////////// 595a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 596a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 597a3f881fbSStefano Zampini c->maxnz = c->nz; 598a3f881fbSStefano Zampini c->nonzerorowcnt = 0; 599a3f881fbSStefano Zampini c->rmax = 0; 600a3f881fbSStefano Zampini for (k = 0; k < m; k++) { 601a3f881fbSStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 602a3f881fbSStefano Zampini c->ilen[k] = c->imax[k] = nn; 603a3f881fbSStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 604a3f881fbSStefano Zampini c->rmax = PetscMax(c->rmax,nn); 605a3f881fbSStefano Zampini } 606a3f881fbSStefano Zampini 607a3f881fbSStefano Zampini C->nonzerostate++; 608a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 609a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 610a3f881fbSStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 611a3f881fbSStefano Zampini ckok->nonzerostate = C->nonzerostate; 612a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 613a3f881fbSStefano Zampini C->preallocated = PETSC_TRUE; 614a3f881fbSStefano Zampini C->assembled = PETSC_FALSE; 615a3f881fbSStefano Zampini C->was_assembled = PETSC_FALSE; 616a3f881fbSStefano Zampini 617a3f881fbSStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 618a3f881fbSStefano Zampini C->product->data = kh; 619a3f881fbSStefano Zampini C->product->destroy = MatMatKernelHandleDestroy_Private; 620a3f881fbSStefano Zampini PetscFunctionReturn(0); 621a3f881fbSStefano Zampini } 622a3f881fbSStefano Zampini 623a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 624a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 625a3f881fbSStefano Zampini { 626a3f881fbSStefano Zampini Mat_Product *product = mat->product; 627a3f881fbSStefano Zampini PetscErrorCode ierr; 628a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 629a3f881fbSStefano Zampini 630a3f881fbSStefano Zampini PetscFunctionBegin; 631a3f881fbSStefano Zampini MatCheckProduct(mat,1); 632a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 633a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 634a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 635a3f881fbSStefano Zampini } 636a3f881fbSStefano Zampini if (Biskok && Ciskok) { 637a3f881fbSStefano Zampini switch (product->type) { 638a3f881fbSStefano Zampini case MATPRODUCT_AB: 639a3f881fbSStefano Zampini case MATPRODUCT_AtB: 640a3f881fbSStefano Zampini case MATPRODUCT_ABt: 641a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 642a3f881fbSStefano Zampini break; 643a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 644a3f881fbSStefano Zampini case MATPRODUCT_RARt: 645a3f881fbSStefano Zampini case MATPRODUCT_ABC: 646a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 647a3f881fbSStefano Zampini break; 648a3f881fbSStefano Zampini default: 649a3f881fbSStefano Zampini break; 650a3f881fbSStefano Zampini } 651a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 652a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 653a3f881fbSStefano Zampini } 654a3f881fbSStefano Zampini PetscFunctionReturn(0); 655a3f881fbSStefano Zampini } 656a3f881fbSStefano Zampini #endif 657a3f881fbSStefano Zampini 658a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 659a587d139SMark { 660a587d139SMark PetscErrorCode ierr; 661a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 662a587d139SMark PetscFunctionBegin; 663*86a27549SJunchao Zhang if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 664a587d139SMark ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 665a587d139SMark PetscFunctionReturn(0); 666a587d139SMark } 667a587d139SMark 668f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 669f0cf5187SStefano Zampini { 670f0cf5187SStefano Zampini PetscErrorCode ierr; 671f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 672f0cf5187SStefano Zampini 673f0cf5187SStefano Zampini PetscFunctionBegin; 674f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 675f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 676f0cf5187SStefano Zampini KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 677f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 678f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 679f0cf5187SStefano Zampini ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 680f0cf5187SStefano Zampini // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 681f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 682f0cf5187SStefano Zampini PetscFunctionReturn(0); 683f0cf5187SStefano Zampini } 684f0cf5187SStefano Zampini 685a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 686a587d139SMark { 687a587d139SMark PetscErrorCode ierr; 688a587d139SMark PetscBool both = PETSC_FALSE; 689a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 690a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 691a587d139SMark 692a587d139SMark PetscFunctionBegin; 693a587d139SMark if (aijkok && aijkok->a_d.data()) { 694f0cf5187SStefano Zampini KokkosBlas::fill(aijkok->a_d,0.0); 695a587d139SMark both = PETSC_TRUE; 696a587d139SMark } 697a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 698a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 699a587d139SMark if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 700a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 701a587d139SMark PetscFunctionReturn(0); 702a587d139SMark } 703a587d139SMark 704f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 705a587d139SMark { 706a587d139SMark PetscErrorCode ierr; 707a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 708a587d139SMark 709a587d139SMark PetscFunctionBegin; 710f0cf5187SStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 711a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 712a587d139SMark PetscFunctionReturn(0); 713a587d139SMark } 714f0cf5187SStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 715a587d139SMark PetscBool e; 716a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 717a587d139SMark if (e) { 718a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 719f0cf5187SStefano Zampini if (e) str = SAME_NONZERO_PATTERN; 720a587d139SMark } 721a587d139SMark } 722a587d139SMark if (str != SAME_NONZERO_PATTERN) { 723a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 724a587d139SMark PetscFunctionReturn(0); 725a587d139SMark } else { 726a587d139SMark if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 727a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 728a587d139SMark PetscFunctionReturn(0); 729f0cf5187SStefano Zampini } else { 730a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 731f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 732a587d139SMark } 733a587d139SMark Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 734a587d139SMark Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 735a587d139SMark if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 736f0cf5187SStefano Zampini KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 737f0cf5187SStefano Zampini Y->offloadmask = PETSC_OFFLOAD_GPU; 738f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 739a587d139SMark ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 740f0cf5187SStefano Zampini // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 741f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 742a587d139SMark } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 743a587d139SMark } 744a587d139SMark PetscFunctionReturn(0); 745a587d139SMark } 746a587d139SMark 747*86a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 7488f7e8f9dSMark Adams { 7498f7e8f9dSMark Adams PetscErrorCode ierr; 7508f7e8f9dSMark Adams 7518f7e8f9dSMark Adams PetscFunctionBegin; 7528f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 7538f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 7548f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 7558f7e8f9dSMark Adams PetscFunctionReturn(0); 7568f7e8f9dSMark Adams } 7578f7e8f9dSMark Adams 7588c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 7598c3ff71bSJunchao Zhang { 7608c3ff71bSJunchao Zhang PetscFunctionBegin; 7616f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 7626f3d89d0SStefano Zampini 763a587d139SMark A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 7648c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 7658c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 7668c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 767a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 768f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 769a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 770a3f881fbSStefano Zampini //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 7718c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 7728c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 7738c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 7748c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 7758c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 7768c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 777152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 7788c3ff71bSJunchao Zhang PetscFunctionReturn(0); 7798c3ff71bSJunchao Zhang } 7808c3ff71bSJunchao Zhang 7818c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 782152b3e56SJunchao Zhang /*@C 7838c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 7848c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 7858c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 7868c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 7878c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 7888c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 7898c3ff71bSJunchao Zhang 7908c3ff71bSJunchao Zhang Collective 7918c3ff71bSJunchao Zhang 7928c3ff71bSJunchao Zhang Input Parameters: 7938c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 7948c3ff71bSJunchao Zhang . m - number of rows 7958c3ff71bSJunchao Zhang . n - number of columns 7968c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 7978c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 7988c3ff71bSJunchao Zhang (possibly different for each row) or NULL 7998c3ff71bSJunchao Zhang 8008c3ff71bSJunchao Zhang Output Parameter: 8018c3ff71bSJunchao Zhang . A - the matrix 8028c3ff71bSJunchao Zhang 8038c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 8048c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 8058c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 8068c3ff71bSJunchao Zhang 8078c3ff71bSJunchao Zhang Notes: 8088c3ff71bSJunchao Zhang If nnz is given then nz is ignored 8098c3ff71bSJunchao Zhang 8108c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 8118c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 8128c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 8138c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 8148c3ff71bSJunchao Zhang 8158c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 8168c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 8178c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 8188c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 8198c3ff71bSJunchao Zhang 8208c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 8218c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 8228c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 8238c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 8248c3ff71bSJunchao Zhang 8258c3ff71bSJunchao Zhang Level: intermediate 8268c3ff71bSJunchao Zhang 8278c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 8288c3ff71bSJunchao Zhang @*/ 8298c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 8308c3ff71bSJunchao Zhang { 8318c3ff71bSJunchao Zhang PetscErrorCode ierr; 8328c3ff71bSJunchao Zhang 8338c3ff71bSJunchao Zhang PetscFunctionBegin; 8348c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 8358c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 8368c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 8378c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 8388c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 8398c3ff71bSJunchao Zhang PetscFunctionReturn(0); 8408c3ff71bSJunchao Zhang } 841930e68a5SMark Adams 842930e68a5SMark Adams // factorizations 8438f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 8448f7e8f9dSMark Adams // 8458f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 8468f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 8478f7e8f9dSMark Adams // 8488f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 849930e68a5SMark Adams { 8508f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 8518f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 8528f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 853930e68a5SMark Adams PetscErrorCode ierr; 8548f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 8558f7e8f9dSMark Adams const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 8568f7e8f9dSMark Adams const PetscScalar *aa_d = aijkok->a_d.data(); 8578f7e8f9dSMark Adams PetscScalar *ba_d = baijkok->a_d.data(); 8588f7e8f9dSMark Adams PetscBool row_identity,col_identity; 8598f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 8608f7e8f9dSMark Adams PetscContainer container; 861930e68a5SMark Adams 862930e68a5SMark Adams PetscFunctionBegin; 8638f7e8f9dSMark Adams if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 8648f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 8658f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 8668f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 8678f7e8f9dSMark Adams if (container) { 8688f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 8698f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 8708f7e8f9dSMark Adams Nf = (*pNf)%1000; 8718f7e8f9dSMark Adams nv = (*pNf)/1000; 8728f7e8f9dSMark Adams if (nv>0) nVec = nv; 8738f7e8f9dSMark Adams } else Nf = 1; 8748f7e8f9dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 8758f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 8768f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 8778f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 8788f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 8798f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 8808f7e8f9dSMark Adams #endif 8818f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 8828f7e8f9dSMark Adams { 8838f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 8848f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 8858f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 8868f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 8878f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 8888f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 8898f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 8908f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 8918f7e8f9dSMark Adams size_t flops_h = 0.0; 8928f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 8938f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 8948f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 8958f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 8968f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 8978f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 8988f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 8998f7e8f9dSMark Adams // Fill A --> fact 9008f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 9018f7e8f9dSMark Adams const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda 9028f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 9038f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 9048f7e8f9dSMark Adams // zero rows of B 9058f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 9068f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 9078f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 9088f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 9098f7e8f9dSMark Adams /* zero (unfactored row) */ 9108f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 9118f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 9128f7e8f9dSMark Adams }); 9138f7e8f9dSMark Adams // copy A into B 9148f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 9158f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 9168f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 9178f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 9188f7e8f9dSMark Adams /* load in initial (unfactored row) */ 9198f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 9208f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 9218f7e8f9dSMark Adams PetscScalar vala = av[j]; 9228f7e8f9dSMark Adams if (colb == rowb) { 9238f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 9248f7e8f9dSMark Adams } else { 9258f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 9268f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 9278f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 9288f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 9298f7e8f9dSMark Adams if (pbj[j] == colb) { 9308f7e8f9dSMark Adams pba[j] = vala; 9318f7e8f9dSMark Adams set++; 9328f7e8f9dSMark Adams break; 9338f7e8f9dSMark Adams } 9348f7e8f9dSMark Adams } 9358f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 9368f7e8f9dSMark Adams } 9378f7e8f9dSMark Adams } 9388f7e8f9dSMark Adams }); 9398f7e8f9dSMark Adams }); 9408f7e8f9dSMark Adams Kokkos::fence(); 941930e68a5SMark Adams 9428f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 9438f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 9448f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 9458f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 9468f7e8f9dSMark Adams const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda 9478f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 9488f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 9498f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 9508f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 9518f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 9528f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 9538f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 9548f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 9558f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 9568f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 9578f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 9588f7e8f9dSMark Adams else { 9598f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 9608f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 9618f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 9628f7e8f9dSMark Adams size_t st_idx; 9638f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 9648f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 9658f7e8f9dSMark Adams // get column, there has got to be a better way 9668f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 9678f7e8f9dSMark Adams //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 9688f7e8f9dSMark Adams if (pjL[j] == ii) { 9698f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 9708f7e8f9dSMark Adams idx = j; // output 9718f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 9728f7e8f9dSMark Adams } 9738f7e8f9dSMark Adams }, st_idx); 9748f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 9758f7e8f9dSMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 9768f7e8f9dSMark Adams else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 9778f7e8f9dSMark Adams // U(i+1,:end) 9788f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 9798f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 9808f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 9818f7e8f9dSMark Adams if (col==myk) { 9828f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 9838f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 9848f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 9858f7e8f9dSMark Adams } else { 9868f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 9878f7e8f9dSMark Adams PetscInt high, low; 9888f7e8f9dSMark Adams const PetscInt *startj; 9898f7e8f9dSMark Adams if (col<myk) { // L 9908f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 9918f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 9928f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 9938f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 9948f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 9958f7e8f9dSMark Adams } else { 9968f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 9978f7e8f9dSMark Adams start = ba_d + idx; 9988f7e8f9dSMark Adams startj= bj_d + idx; 9998f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 10008f7e8f9dSMark Adams } 10018f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 10028f7e8f9dSMark Adams low = 0; 10038f7e8f9dSMark Adams high = (PetscInt)(end-start); 10048f7e8f9dSMark Adams while (high-low > 5) { 10058f7e8f9dSMark Adams int t = (low+high)/2; 10068f7e8f9dSMark Adams if (startj[t] > col) high = t; 10078f7e8f9dSMark Adams else low = t; 10088f7e8f9dSMark Adams } 10098f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 10108f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 10118f7e8f9dSMark Adams } 10128f7e8f9dSMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 10138f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 10148f7e8f9dSMark Adams } 10158f7e8f9dSMark Adams }); 10168f7e8f9dSMark Adams } 10178f7e8f9dSMark Adams } 10188f7e8f9dSMark Adams }); 10198f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 10208f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 10218f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 10228f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 10238f7e8f9dSMark Adams }); 10248f7e8f9dSMark Adams Kokkos::fence(); 10258f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 10268f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 10278f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 10288f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 10298f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 10308f7e8f9dSMark Adams #endif 10318f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 10328f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 10338f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 10348f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 10358f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 10368f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 10378f7e8f9dSMark Adams if (i < end) { 10388f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 10398f7e8f9dSMark Adams *pv = 1.0/(*pv); 10408f7e8f9dSMark Adams } 10418f7e8f9dSMark Adams }); 10428f7e8f9dSMark Adams }); 10438f7e8f9dSMark Adams } 10448f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 10458f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 10468f7e8f9dSMark Adams #endif 10478f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 10488f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 10498f7e8f9dSMark Adams 10508f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 10518f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 10528f7e8f9dSMark Adams if (b->inode.size) { 10538f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 10548f7e8f9dSMark Adams } else if (row_identity && col_identity) { 10558f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 10568f7e8f9dSMark Adams } else { 10578f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 10588f7e8f9dSMark Adams } 10598f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 10608f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 10618f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 10628f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 10638f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 10648f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 10658f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 10668f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 10678f7e8f9dSMark Adams 1068930e68a5SMark Adams PetscFunctionReturn(0); 1069930e68a5SMark Adams } 1070930e68a5SMark Adams 1071*86a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1072930e68a5SMark Adams { 1073930e68a5SMark Adams PetscErrorCode ierr; 1074930e68a5SMark Adams 1075930e68a5SMark Adams PetscFunctionBegin; 1076930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1077*86a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1078*86a27549SJunchao Zhang PetscFunctionReturn(0); 1079*86a27549SJunchao Zhang } 1080*86a27549SJunchao Zhang 1081*86a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1082*86a27549SJunchao Zhang { 1083*86a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1084*86a27549SJunchao Zhang 1085*86a27549SJunchao Zhang PetscFunctionBegin; 1086*86a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 1087*86a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1088*86a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1089*86a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 1090*86a27549SJunchao Zhang } 1091*86a27549SJunchao Zhang PetscFunctionReturn(0); 1092*86a27549SJunchao Zhang } 1093*86a27549SJunchao Zhang 1094*86a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1095*86a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1096*86a27549SJunchao Zhang { 1097*86a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1098*86a27549SJunchao Zhang MatColumnIndexType n = A->rmap->n; 1099*86a27549SJunchao Zhang 1100*86a27549SJunchao Zhang PetscFunctionBegin; 1101*86a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1102*86a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1103*86a27549SJunchao Zhang factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1); 1104*86a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1105*86a27549SJunchao Zhang factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1106*86a27549SJunchao Zhang factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1107*86a27549SJunchao Zhang 1108*86a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1109*86a27549SJunchao Zhang ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 1110*86a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 1111*86a27549SJunchao Zhang MatRowOffsetKokkosView,DefaultExecutionSpace>( 1112*86a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1113*86a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 1114*86a27549SJunchao Zhang 1115*86a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1116*86a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 1117*86a27549SJunchao Zhang */ 1118*86a27549SJunchao Zhang KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 1119*86a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 1120*86a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 1121*86a27549SJunchao Zhang 1122*86a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1123*86a27549SJunchao Zhang 1124*86a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1125*86a27549SJunchao Zhang factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1); 1126*86a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1127*86a27549SJunchao Zhang factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1128*86a27549SJunchao Zhang factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1129*86a27549SJunchao Zhang 1130*86a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1131*86a27549SJunchao Zhang ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 1132*86a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 1133*86a27549SJunchao Zhang MatRowOffsetKokkosView,DefaultExecutionSpace>( 1134*86a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1135*86a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 1136*86a27549SJunchao Zhang 1137*86a27549SJunchao Zhang /* Sort indices. See comments above */ 1138*86a27549SJunchao Zhang KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 1139*86a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 1140*86a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 1141*86a27549SJunchao Zhang 1142*86a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1143*86a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 1144*86a27549SJunchao Zhang } 1145*86a27549SJunchao Zhang PetscFunctionReturn(0); 1146*86a27549SJunchao Zhang } 1147*86a27549SJunchao Zhang 1148*86a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1149*86a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1150*86a27549SJunchao Zhang { 1151*86a27549SJunchao Zhang PetscErrorCode ierr; 1152*86a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 1153*86a27549SJunchao Zhang PetscScalarKokkosView xv; 1154*86a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1155*86a27549SJunchao Zhang 1156*86a27549SJunchao Zhang PetscFunctionBegin; 1157*86a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1158*86a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1159*86a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1160*86a27549SJunchao Zhang /* Solve L tmpv = b */ 1161*86a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector); 1162*86a27549SJunchao Zhang /* Solve Ux = tmpv */ 1163*86a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv); 1164*86a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1165*86a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1166*86a27549SJunchao Zhang PetscFunctionReturn(0); 1167*86a27549SJunchao Zhang } 1168*86a27549SJunchao Zhang 1169*86a27549SJunchao Zhang /* Solve A^T x = b, with A^T = U^T L^T */ 1170*86a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1171*86a27549SJunchao Zhang { 1172*86a27549SJunchao Zhang PetscErrorCode ierr; 1173*86a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 1174*86a27549SJunchao Zhang PetscScalarKokkosView xv; 1175*86a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1176*86a27549SJunchao Zhang 1177*86a27549SJunchao Zhang PetscFunctionBegin; 1178*86a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1179*86a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1180*86a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1181*86a27549SJunchao Zhang /* Solve U^T tmpv = b */ 1182*86a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1183*86a27549SJunchao Zhang 1184*86a27549SJunchao Zhang /* Solve L^T x = tmpv */ 1185*86a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1186*86a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1187*86a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1188*86a27549SJunchao Zhang PetscFunctionReturn(0); 1189*86a27549SJunchao Zhang } 1190*86a27549SJunchao Zhang 1191*86a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1192*86a27549SJunchao Zhang { 1193*86a27549SJunchao Zhang PetscErrorCode ierr; 1194*86a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1195*86a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1196*86a27549SJunchao Zhang PetscInt fill_lev = info->levels; 1197*86a27549SJunchao Zhang 1198*86a27549SJunchao Zhang PetscFunctionBegin; 1199*86a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1200*86a27549SJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1201*86a27549SJunchao Zhang 1202*86a27549SJunchao Zhang B->assembled = PETSC_TRUE; 1203*86a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 1204*86a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 1205*86a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1206*86a27549SJunchao Zhang B->ops->matsolve = NULL; 1207*86a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 1208*86a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 1209*86a27549SJunchao Zhang 1210*86a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1211*86a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 1212*86a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1213*86a27549SJunchao Zhang PetscFunctionReturn(0); 1214*86a27549SJunchao Zhang } 1215*86a27549SJunchao Zhang 1216*86a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1217*86a27549SJunchao Zhang { 1218*86a27549SJunchao Zhang PetscErrorCode ierr; 1219*86a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1220*86a27549SJunchao Zhang Mat_SeqAIJ *b; 1221*86a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1222*86a27549SJunchao Zhang PetscInt fill_lev = info->levels; 1223*86a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1224*86a27549SJunchao Zhang PetscInt n = A->rmap->n; 1225*86a27549SJunchao Zhang 1226*86a27549SJunchao Zhang PetscFunctionBegin; 1227*86a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1228*86a27549SJunchao Zhang /* Rebuild factors */ 1229*86a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1230*86a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1231*86a27549SJunchao Zhang 1232*86a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 1233*86a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1234*86a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1235*86a27549SJunchao Zhang 1236*86a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 1237*86a27549SJunchao Zhang 1238*86a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1239*86a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1240*86a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 1241*86a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1242*86a27549SJunchao Zhang 1243*86a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1244*86a27549SJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1245*86a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1246*86a27549SJunchao Zhang 1247*86a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1248*86a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1249*86a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1250*86a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1251*86a27549SJunchao Zhang 1252*86a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 1253*86a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 1254*86a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1255*86a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1256*86a27549SJunchao Zhang #else 1257*86a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1258*86a27549SJunchao Zhang #endif 1259*86a27549SJunchao Zhang 1260*86a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1261*86a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1262*86a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1263*86a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1264*86a27549SJunchao Zhang 1265*86a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 1266*86a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1267*86a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 1268*86a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1269*86a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 1270*86a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1271*86a27549SJunchao Zhang 1272*86a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 1273*86a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1274930e68a5SMark Adams PetscFunctionReturn(0); 1275930e68a5SMark Adams } 1276930e68a5SMark Adams 12778f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 12788f7e8f9dSMark Adams { 12798f7e8f9dSMark Adams PetscErrorCode ierr; 12808f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 12818f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1282930e68a5SMark Adams 12838f7e8f9dSMark Adams PetscFunctionBegin; 12848f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 12858f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 12868f7e8f9dSMark Adams // move B data into Kokkos 12878f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 12888f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 12898f7e8f9dSMark Adams { 12908f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 12918f7e8f9dSMark Adams if (!baijkok->diag_d) { 12928f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1293152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 12948f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 12958f7e8f9dSMark Adams } 12968f7e8f9dSMark Adams } 12978f7e8f9dSMark Adams PetscFunctionReturn(0); 12988f7e8f9dSMark Adams } 12998f7e8f9dSMark Adams 1300*86a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1301930e68a5SMark Adams { 1302930e68a5SMark Adams PetscFunctionBegin; 1303930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1304930e68a5SMark Adams PetscFunctionReturn(0); 1305930e68a5SMark Adams } 1306930e68a5SMark Adams 13078f7e8f9dSMark Adams // use -pc_factor_mat_solver_type kokkosdevice 13088f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 13098f7e8f9dSMark Adams { 13108f7e8f9dSMark Adams PetscFunctionBegin; 13118f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 13128f7e8f9dSMark Adams PetscFunctionReturn(0); 13138f7e8f9dSMark Adams } 13148f7e8f9dSMark Adams 1315930e68a5SMark Adams /*MC 1316*86a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1317*86a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1318930e68a5SMark Adams 1319930e68a5SMark Adams Level: beginner 1320930e68a5SMark Adams 1321*86a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1322930e68a5SMark Adams M*/ 1323*86a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1324930e68a5SMark Adams { 1325930e68a5SMark Adams PetscErrorCode ierr; 1326930e68a5SMark Adams PetscInt n = A->rmap->n; 1327930e68a5SMark Adams 1328930e68a5SMark Adams PetscFunctionBegin; 1329930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1330930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1331930e68a5SMark Adams (*B)->factortype = ftype; 13324ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1333930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1334930e68a5SMark Adams 13358f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1336930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1337*86a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 1338*86a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1339*86a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 1340*86a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1341*86a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 1342*86a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1343*86a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1344930e68a5SMark Adams 1345930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1346*86a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1347930e68a5SMark Adams PetscFunctionReturn(0); 1348930e68a5SMark Adams } 13498f7e8f9dSMark Adams 13508f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 13518f7e8f9dSMark Adams { 13528f7e8f9dSMark Adams PetscErrorCode ierr; 13538f7e8f9dSMark Adams PetscInt n = A->rmap->n; 13548f7e8f9dSMark Adams 13558f7e8f9dSMark Adams PetscFunctionBegin; 13568f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 13578f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 13588f7e8f9dSMark Adams (*B)->factortype = ftype; 1359f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 13604ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 13618f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 13628f7e8f9dSMark Adams 13638f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 13648f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 13658f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 13668f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 13678f7e8f9dSMark Adams 13688f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 13698f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 13708f7e8f9dSMark Adams PetscFunctionReturn(0); 13718f7e8f9dSMark Adams } 1372*86a27549SJunchao Zhang 1373*86a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1374*86a27549SJunchao Zhang { 1375*86a27549SJunchao Zhang PetscErrorCode ierr; 1376*86a27549SJunchao Zhang 1377*86a27549SJunchao Zhang PetscFunctionBegin; 1378*86a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1379*86a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1380*86a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1381*86a27549SJunchao Zhang PetscFunctionReturn(0); 1382*86a27549SJunchao Zhang } 1383*86a27549SJunchao Zhang 1384