111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 38c3ff71bSJunchao Zhang #include <petscsystypes.h> 48c3ff71bSJunchao Zhang #include <petscerror.h> 58c3ff71bSJunchao Zhang 68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 7f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1086a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1186a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 1286a27549SJunchao Zhang 138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 148c3ff71bSJunchao Zhang 158c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 16a587d139SMark #include <petscmat.h> 178c3ff71bSJunchao Zhang 188c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 218c3ff71bSJunchao Zhang { 228c3ff71bSJunchao Zhang PetscErrorCode ierr; 23a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 248c3ff71bSJunchao Zhang 258c3ff71bSJunchao Zhang PetscFunctionBegin; 268c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 2786a27549SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 28a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 29a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 30a587d139SMark } 318c3ff71bSJunchao Zhang /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 328c3ff71bSJunchao Zhang PetscFunctionReturn(0); 338c3ff71bSJunchao Zhang } 348c3ff71bSJunchao Zhang 3586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 368c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 378c3ff71bSJunchao Zhang { 38152b3e56SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ*>(A->data); 398c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408c3ff71bSJunchao Zhang PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 418c3ff71bSJunchao Zhang 428c3ff71bSJunchao Zhang PetscFunctionBegin; 4386a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 44152b3e56SJunchao Zhang /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */ 458c3ff71bSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 468c3ff71bSJunchao Zhang delete aijkok; 478c3ff71bSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 488c3ff71bSJunchao Zhang aijkok->nonzerostate = A->nonzerostate; 498c3ff71bSJunchao Zhang A->spptr = aijkok; 508c3ff71bSJunchao Zhang } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 5186a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 52152b3e56SJunchao Zhang aijkok->a_dual.modify_host(); /* Mark host is modified */ 53152b3e56SJunchao Zhang aijkok->a_dual.sync_device(); /* Sync the device */ 5486a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 5586a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 568c3ff71bSJunchao Zhang } 578c3ff71bSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 588c3ff71bSJunchao Zhang PetscFunctionReturn(0); 598c3ff71bSJunchao Zhang } 608c3ff71bSJunchao Zhang 6186a27549SJunchao Zhang /* Mark the CSR data on device is modified */ 6286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A) 6386a27549SJunchao Zhang { 6486a27549SJunchao Zhang PetscErrorCode ierr; 6586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 6686a27549SJunchao Zhang 6786a27549SJunchao Zhang PetscFunctionBegin; 6886a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 6986a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 7086a27549SJunchao Zhang aijkok->a_dual.modify_device(); 7186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 7286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 7386a27549SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_GPU; 7486a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 7586a27549SJunchao Zhang PetscFunctionReturn(0); 7686a27549SJunchao Zhang } 7786a27549SJunchao Zhang 78f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 79f0cf5187SStefano Zampini { 80f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 81f0cf5187SStefano Zampini 82f0cf5187SStefano Zampini PetscFunctionBegin; 83f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 8486a27549SJunchao Zhang /* We do not expect one needs factors on host */ 8586a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 86f0cf5187SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 87f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 8886a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8986a27549SJunchao Zhang aijkok->a_dual.modify_device(); /* Mark device is modified */ 9086a27549SJunchao Zhang aijkok->a_dual.sync_host(); /* Sync the host */ 91f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 92f0cf5187SStefano Zampini } 93f0cf5187SStefano Zampini PetscFunctionReturn(0); 94f0cf5187SStefano Zampini } 95f0cf5187SStefano Zampini 96f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 97f0cf5187SStefano Zampini { 98f0cf5187SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 99f0cf5187SStefano Zampini PetscErrorCode ierr; 100f0cf5187SStefano Zampini 101f0cf5187SStefano Zampini PetscFunctionBegin; 102f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 103f0cf5187SStefano Zampini *array = a->a; 104f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 105f0cf5187SStefano Zampini PetscFunctionReturn(0); 106f0cf5187SStefano Zampini } 107f0cf5187SStefano Zampini 108a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 109a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 110a587d139SMark { 111a587d139SMark Mat_SeqAIJKokkos *aijkok; 112a587d139SMark Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 113a587d139SMark 114a587d139SMark PetscFunctionBegin; 115a587d139SMark // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 116a587d139SMark aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 117a587d139SMark if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 118152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 119a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 120a587d139SMark PetscFunctionReturn(0); 121a587d139SMark } 122a587d139SMark 123a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 124a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 125a587d139SMark { 126a587d139SMark Mat_SeqAIJKokkos *aijkok; 127a587d139SMark 128a587d139SMark PetscFunctionBegin; 129a587d139SMark aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 131a587d139SMark *d_mat = aijkok->device_mat_d.data(); 132a587d139SMark } else { 133a587d139SMark PetscErrorCode ierr; 134a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 135a587d139SMark *d_mat = NULL; 136a587d139SMark } 137a587d139SMark PetscFunctionReturn(0); 138a587d139SMark } 139152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A) 140152b3e56SJunchao Zhang { 141152b3e56SJunchao Zhang PetscErrorCode ierr; 142152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 143152b3e56SJunchao Zhang 144152b3e56SJunchao Zhang PetscFunctionBegin; 145152b3e56SJunchao Zhang if (!aijkok->At) { /* Generate At for the first time */ 146152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr); 147152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 14886a27549SJunchao Zhang } else if (!aijkok->transpose_updated) { /* Only update At values */ 149152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr); 150152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 151152b3e56SJunchao Zhang } 15286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 153152b3e56SJunchao Zhang PetscFunctionReturn(0); 154152b3e56SJunchao Zhang } 155152b3e56SJunchao Zhang 15686a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A) 157152b3e56SJunchao Zhang { 158152b3e56SJunchao Zhang PetscErrorCode ierr; 159152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 160152b3e56SJunchao Zhang 161152b3e56SJunchao Zhang PetscFunctionBegin; 162152b3e56SJunchao Zhang if (!aijkok->Ah) { /* Generate Ah for the first time */ 163152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 164152b3e56SJunchao Zhang ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 165152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 16686a27549SJunchao Zhang } else if (!aijkok->hermitian_updated) { /* Only update Ah values */ 167152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 168152b3e56SJunchao Zhang ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 169152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 170152b3e56SJunchao Zhang } 17186a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 172152b3e56SJunchao Zhang PetscFunctionReturn(0); 173152b3e56SJunchao Zhang } 174a587d139SMark 1758c3ff71bSJunchao Zhang /* y = A x */ 1768c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1778c3ff71bSJunchao Zhang { 1788c3ff71bSJunchao Zhang PetscErrorCode ierr; 1798c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 180152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 181152b3e56SJunchao Zhang PetscScalarKokkosView yv; 1828c3ff71bSJunchao Zhang 1838c3ff71bSJunchao Zhang PetscFunctionBegin; 1848c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 185152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 186152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 1878c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 189152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 190152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 191bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 192152b3e56SJunchao Zhang /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 193152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 1948c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1958c3ff71bSJunchao Zhang } 1968c3ff71bSJunchao Zhang 1978c3ff71bSJunchao Zhang /* y = A^T x */ 1988c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1998c3ff71bSJunchao Zhang { 2008c3ff71bSJunchao Zhang PetscErrorCode ierr; 2018c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 202152b3e56SJunchao Zhang Mat B; 203152b3e56SJunchao Zhang const char *mode; 204152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 205152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2068c3ff71bSJunchao Zhang 2078c3ff71bSJunchao Zhang PetscFunctionBegin; 2088c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 209152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 210152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 211152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 212152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 213152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 214152b3e56SJunchao Zhang mode = "N"; 215152b3e56SJunchao Zhang } else { 216152b3e56SJunchao Zhang B = A; 217152b3e56SJunchao Zhang mode = "T"; 218152b3e56SJunchao Zhang } 219152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 220152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 221152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 222152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 223bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 224152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2258c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2268c3ff71bSJunchao Zhang } 2278c3ff71bSJunchao Zhang 2288c3ff71bSJunchao Zhang /* y = A^H x */ 2298c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2308c3ff71bSJunchao Zhang { 2318c3ff71bSJunchao Zhang PetscErrorCode ierr; 2328c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 233152b3e56SJunchao Zhang Mat B; 234152b3e56SJunchao Zhang const char *mode; 235152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 236152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2378c3ff71bSJunchao Zhang 2388c3ff71bSJunchao Zhang PetscFunctionBegin; 2398c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 240152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 241152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 242152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 24386a27549SJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 244152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 245152b3e56SJunchao Zhang mode = "N"; 246152b3e56SJunchao Zhang } else { 247152b3e56SJunchao Zhang B = A; 248152b3e56SJunchao Zhang mode = "C"; 249152b3e56SJunchao Zhang } 250152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 251152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 252152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 253152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 254bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 255152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2568c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2578c3ff71bSJunchao Zhang } 2588c3ff71bSJunchao Zhang 2598c3ff71bSJunchao Zhang /* z = A x + y */ 2608c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 2618c3ff71bSJunchao Zhang { 2628c3ff71bSJunchao Zhang PetscErrorCode ierr; 2638c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 264152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 265152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2668c3ff71bSJunchao Zhang 2678c3ff71bSJunchao Zhang PetscFunctionBegin; 2688c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 269152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 270152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 271152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 2728c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 2738c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 274152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 275152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 276152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 277152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 278bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 279152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2808c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2818c3ff71bSJunchao Zhang } 2828c3ff71bSJunchao Zhang 2838c3ff71bSJunchao Zhang /* z = A^T x + y */ 2848c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 2858c3ff71bSJunchao Zhang { 2868c3ff71bSJunchao Zhang PetscErrorCode ierr; 2878c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 288152b3e56SJunchao Zhang Mat B; 289152b3e56SJunchao Zhang const char *mode; 290152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 291152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2928c3ff71bSJunchao Zhang 2938c3ff71bSJunchao Zhang PetscFunctionBegin; 2948c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 295152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 296152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 297152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 2988c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 299152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 300152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 301152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 302152b3e56SJunchao Zhang mode = "N"; 303152b3e56SJunchao Zhang } else { 304152b3e56SJunchao Zhang B = A; 305152b3e56SJunchao Zhang mode = "T"; 306152b3e56SJunchao Zhang } 307152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 308152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 309152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 310152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 311152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 312bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 313152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 3148c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3158c3ff71bSJunchao Zhang } 3168c3ff71bSJunchao Zhang 3178c3ff71bSJunchao Zhang /* z = A^H x + y */ 3188c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3198c3ff71bSJunchao Zhang { 3208c3ff71bSJunchao Zhang PetscErrorCode ierr; 3218c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 322152b3e56SJunchao Zhang Mat B; 323152b3e56SJunchao Zhang const char *mode; 324152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 325152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3268c3ff71bSJunchao Zhang 3278c3ff71bSJunchao Zhang PetscFunctionBegin; 3288c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 329152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 330152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 331152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3328c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 333152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 33486a27549SJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 335152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 336152b3e56SJunchao Zhang mode = "N"; 337152b3e56SJunchao Zhang } else { 338152b3e56SJunchao Zhang B = A; 339152b3e56SJunchao Zhang mode = "C"; 340152b3e56SJunchao Zhang } 341152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 342152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 343152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 344152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 345152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 346bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 347152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348152b3e56SJunchao Zhang PetscFunctionReturn(0); 349152b3e56SJunchao Zhang } 350152b3e56SJunchao Zhang 351152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 352152b3e56SJunchao Zhang { 353152b3e56SJunchao Zhang PetscErrorCode ierr; 354152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355152b3e56SJunchao Zhang 356152b3e56SJunchao Zhang PetscFunctionBegin; 357152b3e56SJunchao Zhang switch (op) { 358152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 359152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 360152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 361152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 362152b3e56SJunchao Zhang break; 363152b3e56SJunchao Zhang default: 364152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 365152b3e56SJunchao Zhang break; 366152b3e56SJunchao Zhang } 3678c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3688c3ff71bSJunchao Zhang } 3698c3ff71bSJunchao Zhang 3703d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3718c3ff71bSJunchao Zhang { 3728c3ff71bSJunchao Zhang PetscErrorCode ierr; 3738c3ff71bSJunchao Zhang Mat B; 374c58ef05eSStefano Zampini Mat_SeqAIJ *aij; 3758c3ff71bSJunchao Zhang 3768c3ff71bSJunchao Zhang PetscFunctionBegin; 377a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 3788c3ff71bSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 3798c3ff71bSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3808c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 3818c3ff71bSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3828c3ff71bSJunchao Zhang } 3838c3ff71bSJunchao Zhang 3848c3ff71bSJunchao Zhang B = *newmat; 3858c3ff71bSJunchao Zhang ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 38686a27549SJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 38786a27549SJunchao Zhang 3888c3ff71bSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 3890d8bce8aSStefano Zampini ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 390f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 391c58ef05eSStefano Zampini /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 392c58ef05eSStefano Zampini aij = (Mat_SeqAIJ*)B->data; 393c58ef05eSStefano Zampini aij->inode.use = PETSC_FALSE; 3948c3ff71bSJunchao Zhang 3958c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3968c3ff71bSJunchao Zhang } 3978c3ff71bSJunchao Zhang 3988c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 3998c3ff71bSJunchao Zhang { 4008c3ff71bSJunchao Zhang PetscErrorCode ierr; 4018c3ff71bSJunchao Zhang 4028c3ff71bSJunchao Zhang PetscFunctionBegin; 4038c3ff71bSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 4048c3ff71bSJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 4058c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4068c3ff71bSJunchao Zhang } 4078c3ff71bSJunchao Zhang 4088c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 4098c3ff71bSJunchao Zhang { 4108c3ff71bSJunchao Zhang PetscErrorCode ierr; 41186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 4128c3ff71bSJunchao Zhang 4138c3ff71bSJunchao Zhang PetscFunctionBegin; 41486a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 41586a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 4168f7e8f9dSMark Adams if (aijkok) { 4178f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 418a587d139SMark delete aijkok->colmap_d; 419a587d139SMark delete aijkok->i_uncompressed_d; 420a587d139SMark } 4218f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 4228f7e8f9dSMark Adams } 4238c3ff71bSJunchao Zhang delete aijkok; 42486a27549SJunchao Zhang } else { 42586a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 42686a27549SJunchao Zhang } 427f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 428152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 429152b3e56SJunchao Zhang A->spptr = NULL; 4308c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 4318c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4328c3ff71bSJunchao Zhang } 4338c3ff71bSJunchao Zhang 43486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 43586a27549SJunchao Zhang { 43686a27549SJunchao Zhang PetscErrorCode ierr; 43786a27549SJunchao Zhang 43886a27549SJunchao Zhang PetscFunctionBegin; 43986a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 44086a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 44186a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 44286a27549SJunchao Zhang PetscFunctionReturn(0); 44386a27549SJunchao Zhang } 44486a27549SJunchao Zhang 445a3f881fbSStefano Zampini #if 0 446a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 447a3f881fbSStefano Zampini { 448a3f881fbSStefano Zampini MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 449a3f881fbSStefano Zampini 450a3f881fbSStefano Zampini PetscFunctionBegin; 451a3f881fbSStefano Zampini delete kh; 452a3f881fbSStefano Zampini PetscFunctionReturn(0); 453a3f881fbSStefano Zampini } 454a3f881fbSStefano Zampini 455a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 456a3f881fbSStefano Zampini { 457a3f881fbSStefano Zampini Mat_Product *product = C->product; 458a3f881fbSStefano Zampini Mat A,B; 459a3f881fbSStefano Zampini MatProductType ptype; 460a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 461a3f881fbSStefano Zampini bool tA,tB; 462a3f881fbSStefano Zampini PetscErrorCode ierr; 463a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 464a3f881fbSStefano Zampini Mat_SeqAIJ *c; 465a3f881fbSStefano Zampini 466a3f881fbSStefano Zampini PetscFunctionBegin; 467a3f881fbSStefano Zampini MatCheckProduct(C,1); 468a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 469a3f881fbSStefano Zampini A = product->A; 470a3f881fbSStefano Zampini B = product->B; 471a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 472a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 473a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 474a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 475a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 476a3f881fbSStefano Zampini kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 477a3f881fbSStefano Zampini ptype = product->type; 478a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 479a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 480a3f881fbSStefano Zampini switch (ptype) { 481a3f881fbSStefano Zampini case MATPRODUCT_AB: 482a3f881fbSStefano Zampini tA = false; 483a3f881fbSStefano Zampini tB = false; 484a3f881fbSStefano Zampini break; 485a3f881fbSStefano Zampini case MATPRODUCT_AtB: 486a3f881fbSStefano Zampini tA = true; 487a3f881fbSStefano Zampini tB = false; 488a3f881fbSStefano Zampini break; 489a3f881fbSStefano Zampini case MATPRODUCT_ABt: 490a3f881fbSStefano Zampini tA = false; 491a3f881fbSStefano Zampini tB = true; 492a3f881fbSStefano Zampini break; 493a3f881fbSStefano Zampini default: 494a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 495a3f881fbSStefano Zampini } 496a3f881fbSStefano Zampini 497a3f881fbSStefano Zampini KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 498a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 499a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 500a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 501a3f881fbSStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 502a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 503a3f881fbSStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 504a3f881fbSStefano Zampini c->reallocs = 0; 505a3f881fbSStefano Zampini C->info.mallocs += 0; 506a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 507a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 508a3f881fbSStefano Zampini C->num_ass++; 509a3f881fbSStefano Zampini /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 510a3f881fbSStefano Zampini // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 511a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_BOTH; 512a3f881fbSStefano Zampini // Also, we should add support to copy back from device to host 513a3f881fbSStefano Zampini PetscFunctionReturn(0); 514a3f881fbSStefano Zampini } 515a3f881fbSStefano Zampini 516a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 517a3f881fbSStefano Zampini { 518a3f881fbSStefano Zampini Mat_Product *product = C->product; 519a3f881fbSStefano Zampini Mat A,B; 520a3f881fbSStefano Zampini MatProductType ptype; 521a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 522a3f881fbSStefano Zampini PetscInt m,n,k; 523a3f881fbSStefano Zampini bool tA,tB; 524a3f881fbSStefano Zampini PetscErrorCode ierr; 525a3f881fbSStefano Zampini Mat_SeqAIJ *c; 526a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 527a3f881fbSStefano Zampini 528a3f881fbSStefano Zampini PetscFunctionBegin; 529a3f881fbSStefano Zampini MatCheckProduct(C,1); 530a3f881fbSStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 531a3f881fbSStefano Zampini A = product->A; 532a3f881fbSStefano Zampini B = product->B; 533a3f881fbSStefano Zampini // TODO only copy the i,j data, not the values 534a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 535a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 536a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 537a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 538a3f881fbSStefano Zampini ptype = product->type; 539a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 540a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 541a3f881fbSStefano Zampini switch (ptype) { 542a3f881fbSStefano Zampini case MATPRODUCT_AB: 543a3f881fbSStefano Zampini tA = false; 544a3f881fbSStefano Zampini tB = false; 545a3f881fbSStefano Zampini m = A->rmap->n; 546a3f881fbSStefano Zampini n = B->cmap->n; 547a3f881fbSStefano Zampini break; 548a3f881fbSStefano Zampini case MATPRODUCT_AtB: 549a3f881fbSStefano Zampini tA = true; 550a3f881fbSStefano Zampini tB = false; 551a3f881fbSStefano Zampini m = A->cmap->n; 552a3f881fbSStefano Zampini n = B->cmap->n; 553a3f881fbSStefano Zampini break; 554a3f881fbSStefano Zampini case MATPRODUCT_ABt: 555a3f881fbSStefano Zampini tA = false; 556a3f881fbSStefano Zampini tB = true; 557a3f881fbSStefano Zampini m = A->rmap->n; 558a3f881fbSStefano Zampini n = B->rmap->n; 559a3f881fbSStefano Zampini break; 560a3f881fbSStefano Zampini default: 561a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 562a3f881fbSStefano Zampini } 563a3f881fbSStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 564a3f881fbSStefano Zampini ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 565a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 566a3f881fbSStefano Zampini 567a3f881fbSStefano Zampini kh = new MatMatKernelHandle_t; 568a3f881fbSStefano Zampini // TODO SZ: ADD RUNTIME SELECTION OF THESE 569a3f881fbSStefano Zampini kh->set_team_work_size(16); 570a3f881fbSStefano Zampini kh->set_dynamic_scheduling(true); 571a3f881fbSStefano Zampini // Select an spgemm algorithm, limited by configuration at compile-time and 572a3f881fbSStefano Zampini // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 573a3f881fbSStefano Zampini // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 574a3f881fbSStefano Zampini std::string myalg("SPGEMM_KK_MEMORY"); 575a3f881fbSStefano Zampini kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 576a3f881fbSStefano Zampini 577a3f881fbSStefano Zampini ///////////////////////////////////// 578a3f881fbSStefano Zampini // TODO JZ 579a3f881fbSStefano Zampini ckok = NULL; //new Mat_SeqAIJKokkos(); 580a3f881fbSStefano Zampini C->spptr = ckok; 581a3f881fbSStefano Zampini KokkosCsrMatrix_t ccsr; // here only to have the code compile 582a3f881fbSStefano Zampini KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 583a3f881fbSStefano Zampini //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 584a3f881fbSStefano Zampini //c->nz = get_nnz_from_ccsr 585a3f881fbSStefano Zampini ////////////////////////////////////// 586a3f881fbSStefano Zampini c->singlemalloc = PETSC_FALSE; 587a3f881fbSStefano Zampini c->free_a = PETSC_TRUE; 588a3f881fbSStefano Zampini c->free_ij = PETSC_TRUE; 589a3f881fbSStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 590a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 591a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 592a3f881fbSStefano Zampini //////////////////////////////////// 593a3f881fbSStefano Zampini // TODO JZ copy from device to c->i and c->j 594a3f881fbSStefano Zampini //////////////////////////////////// 595a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 596a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 597a3f881fbSStefano Zampini c->maxnz = c->nz; 598a3f881fbSStefano Zampini c->nonzerorowcnt = 0; 599a3f881fbSStefano Zampini c->rmax = 0; 600a3f881fbSStefano Zampini for (k = 0; k < m; k++) { 601a3f881fbSStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 602a3f881fbSStefano Zampini c->ilen[k] = c->imax[k] = nn; 603a3f881fbSStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 604a3f881fbSStefano Zampini c->rmax = PetscMax(c->rmax,nn); 605a3f881fbSStefano Zampini } 606a3f881fbSStefano Zampini 607a3f881fbSStefano Zampini C->nonzerostate++; 608a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 609a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 610a3f881fbSStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 611a3f881fbSStefano Zampini ckok->nonzerostate = C->nonzerostate; 612a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 613a3f881fbSStefano Zampini C->preallocated = PETSC_TRUE; 614a3f881fbSStefano Zampini C->assembled = PETSC_FALSE; 615a3f881fbSStefano Zampini C->was_assembled = PETSC_FALSE; 616a3f881fbSStefano Zampini 617a3f881fbSStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 618a3f881fbSStefano Zampini C->product->data = kh; 619a3f881fbSStefano Zampini C->product->destroy = MatMatKernelHandleDestroy_Private; 620a3f881fbSStefano Zampini PetscFunctionReturn(0); 621a3f881fbSStefano Zampini } 622a3f881fbSStefano Zampini 623a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 624a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 625a3f881fbSStefano Zampini { 626a3f881fbSStefano Zampini Mat_Product *product = mat->product; 627a3f881fbSStefano Zampini PetscErrorCode ierr; 628a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 629a3f881fbSStefano Zampini 630a3f881fbSStefano Zampini PetscFunctionBegin; 631a3f881fbSStefano Zampini MatCheckProduct(mat,1); 632a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 633a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 634a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 635a3f881fbSStefano Zampini } 636a3f881fbSStefano Zampini if (Biskok && Ciskok) { 637a3f881fbSStefano Zampini switch (product->type) { 638a3f881fbSStefano Zampini case MATPRODUCT_AB: 639a3f881fbSStefano Zampini case MATPRODUCT_AtB: 640a3f881fbSStefano Zampini case MATPRODUCT_ABt: 641a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 642a3f881fbSStefano Zampini break; 643a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 644a3f881fbSStefano Zampini case MATPRODUCT_RARt: 645a3f881fbSStefano Zampini case MATPRODUCT_ABC: 646a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 647a3f881fbSStefano Zampini break; 648a3f881fbSStefano Zampini default: 649a3f881fbSStefano Zampini break; 650a3f881fbSStefano Zampini } 651a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 652a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 653a3f881fbSStefano Zampini } 654a3f881fbSStefano Zampini PetscFunctionReturn(0); 655a3f881fbSStefano Zampini } 656a3f881fbSStefano Zampini #endif 657a3f881fbSStefano Zampini 658a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 659a587d139SMark { 660a587d139SMark PetscErrorCode ierr; 661a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 662a587d139SMark PetscFunctionBegin; 66386a27549SJunchao Zhang if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 664a587d139SMark ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 665a587d139SMark PetscFunctionReturn(0); 666a587d139SMark } 667a587d139SMark 668f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 669f0cf5187SStefano Zampini { 670f0cf5187SStefano Zampini PetscErrorCode ierr; 671f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 672f0cf5187SStefano Zampini 673f0cf5187SStefano Zampini PetscFunctionBegin; 674f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 675f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 676f0cf5187SStefano Zampini KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 677f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 678f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 679f0cf5187SStefano Zampini ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 680f0cf5187SStefano Zampini // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 681f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 682f0cf5187SStefano Zampini PetscFunctionReturn(0); 683f0cf5187SStefano Zampini } 684f0cf5187SStefano Zampini 685a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 686a587d139SMark { 687a587d139SMark PetscErrorCode ierr; 688a587d139SMark PetscBool both = PETSC_FALSE; 689a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 690a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 691a587d139SMark 692a587d139SMark PetscFunctionBegin; 693a587d139SMark if (aijkok && aijkok->a_d.data()) { 694f0cf5187SStefano Zampini KokkosBlas::fill(aijkok->a_d,0.0); 695a587d139SMark both = PETSC_TRUE; 696a587d139SMark } 697a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 698a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 699a587d139SMark if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 700a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 701a587d139SMark PetscFunctionReturn(0); 702a587d139SMark } 703a587d139SMark 704*db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 705*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 706*db78de30SJunchao Zhang { 707*db78de30SJunchao Zhang PetscErrorCode ierr; 708*db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 709*db78de30SJunchao Zhang 710*db78de30SJunchao Zhang PetscFunctionBegin; 711*db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 712*db78de30SJunchao Zhang PetscValidPointer(kv,2); 713*db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 714*db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 715*db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 716*db78de30SJunchao Zhang *kv = aijkok->a_d; 717*db78de30SJunchao Zhang PetscFunctionReturn(0); 718*db78de30SJunchao Zhang } 719*db78de30SJunchao Zhang 720*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 721*db78de30SJunchao Zhang { 722*db78de30SJunchao Zhang PetscFunctionBegin; 723*db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 724*db78de30SJunchao Zhang PetscValidPointer(kv,2); 725*db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 726*db78de30SJunchao Zhang PetscFunctionReturn(0); 727*db78de30SJunchao Zhang } 728*db78de30SJunchao Zhang 729*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 730*db78de30SJunchao Zhang { 731*db78de30SJunchao Zhang PetscErrorCode ierr; 732*db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 733*db78de30SJunchao Zhang 734*db78de30SJunchao Zhang PetscFunctionBegin; 735*db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 736*db78de30SJunchao Zhang PetscValidPointer(kv,2); 737*db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 738*db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 739*db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 740*db78de30SJunchao Zhang *kv = aijkok->a_d; 741*db78de30SJunchao Zhang PetscFunctionReturn(0); 742*db78de30SJunchao Zhang } 743*db78de30SJunchao Zhang 744*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 745*db78de30SJunchao Zhang { 746*db78de30SJunchao Zhang PetscErrorCode ierr; 747*db78de30SJunchao Zhang 748*db78de30SJunchao Zhang PetscFunctionBegin; 749*db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 750*db78de30SJunchao Zhang PetscValidPointer(kv,2); 751*db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 752*db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr); 753*db78de30SJunchao Zhang PetscFunctionReturn(0); 754*db78de30SJunchao Zhang } 755*db78de30SJunchao Zhang 756*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 757*db78de30SJunchao Zhang { 758*db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 759*db78de30SJunchao Zhang 760*db78de30SJunchao Zhang PetscFunctionBegin; 761*db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 762*db78de30SJunchao Zhang PetscValidPointer(kv,2); 763*db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 764*db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 765*db78de30SJunchao Zhang *kv = aijkok->a_d; 766*db78de30SJunchao Zhang PetscFunctionReturn(0); 767*db78de30SJunchao Zhang } 768*db78de30SJunchao Zhang 769*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 770*db78de30SJunchao Zhang { 771*db78de30SJunchao Zhang PetscErrorCode ierr; 772*db78de30SJunchao Zhang 773*db78de30SJunchao Zhang PetscFunctionBegin; 774*db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 775*db78de30SJunchao Zhang PetscValidPointer(kv,2); 776*db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 777*db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr); 778*db78de30SJunchao Zhang PetscFunctionReturn(0); 779*db78de30SJunchao Zhang } 780*db78de30SJunchao Zhang 781*db78de30SJunchao Zhang /* Computes Y += a*X */ 782f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 783a587d139SMark { 784a587d139SMark PetscErrorCode ierr; 785a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 786a587d139SMark 787a587d139SMark PetscFunctionBegin; 788f0cf5187SStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 789a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 790a587d139SMark PetscFunctionReturn(0); 791a587d139SMark } 792*db78de30SJunchao Zhang 793f0cf5187SStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 794a587d139SMark PetscBool e; 795a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 796a587d139SMark if (e) { 797a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 798f0cf5187SStefano Zampini if (e) str = SAME_NONZERO_PATTERN; 799a587d139SMark } 800a587d139SMark } 801*db78de30SJunchao Zhang 802a587d139SMark if (str != SAME_NONZERO_PATTERN) { 803*db78de30SJunchao Zhang /* TODO: do MatAXPY on device */ 804a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 805a587d139SMark PetscFunctionReturn(0); 806a587d139SMark } else { 807*db78de30SJunchao Zhang ConstPetscScalarKokkosView xv; 808*db78de30SJunchao Zhang PetscScalarKokkosView yv; 809*db78de30SJunchao Zhang 810*db78de30SJunchao Zhang ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr); 811*db78de30SJunchao Zhang ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr); 812*db78de30SJunchao Zhang KokkosBlas::axpy(a,xv,yv); 813*db78de30SJunchao Zhang ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr); 814*db78de30SJunchao Zhang ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr); 815a587d139SMark } 816a587d139SMark PetscFunctionReturn(0); 817a587d139SMark } 818a587d139SMark 81986a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 8208f7e8f9dSMark Adams { 8218f7e8f9dSMark Adams PetscErrorCode ierr; 8228f7e8f9dSMark Adams 8238f7e8f9dSMark Adams PetscFunctionBegin; 8248f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 8258f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 8268f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 8278f7e8f9dSMark Adams PetscFunctionReturn(0); 8288f7e8f9dSMark Adams } 8298f7e8f9dSMark Adams 8308c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 8318c3ff71bSJunchao Zhang { 8328c3ff71bSJunchao Zhang PetscFunctionBegin; 8336f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 8346f3d89d0SStefano Zampini 835a587d139SMark A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 8368c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 8378c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 8388c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 839a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 840f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 841a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 842a3f881fbSStefano Zampini //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 8438c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 8448c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 8458c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 8468c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 8478c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 8488c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 849152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 8508c3ff71bSJunchao Zhang PetscFunctionReturn(0); 8518c3ff71bSJunchao Zhang } 8528c3ff71bSJunchao Zhang 8538c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 854152b3e56SJunchao Zhang /*@C 8558c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 8568c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 8578c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 8588c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 8598c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 8608c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 8618c3ff71bSJunchao Zhang 8628c3ff71bSJunchao Zhang Collective 8638c3ff71bSJunchao Zhang 8648c3ff71bSJunchao Zhang Input Parameters: 8658c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 8668c3ff71bSJunchao Zhang . m - number of rows 8678c3ff71bSJunchao Zhang . n - number of columns 8688c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 8698c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 8708c3ff71bSJunchao Zhang (possibly different for each row) or NULL 8718c3ff71bSJunchao Zhang 8728c3ff71bSJunchao Zhang Output Parameter: 8738c3ff71bSJunchao Zhang . A - the matrix 8748c3ff71bSJunchao Zhang 8758c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 8768c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 8778c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 8788c3ff71bSJunchao Zhang 8798c3ff71bSJunchao Zhang Notes: 8808c3ff71bSJunchao Zhang If nnz is given then nz is ignored 8818c3ff71bSJunchao Zhang 8828c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 8838c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 8848c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 8858c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 8868c3ff71bSJunchao Zhang 8878c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 8888c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 8898c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 8908c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 8918c3ff71bSJunchao Zhang 8928c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 8938c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 8948c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 8958c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 8968c3ff71bSJunchao Zhang 8978c3ff71bSJunchao Zhang Level: intermediate 8988c3ff71bSJunchao Zhang 8998c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 9008c3ff71bSJunchao Zhang @*/ 9018c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 9028c3ff71bSJunchao Zhang { 9038c3ff71bSJunchao Zhang PetscErrorCode ierr; 9048c3ff71bSJunchao Zhang 9058c3ff71bSJunchao Zhang PetscFunctionBegin; 9068c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 9078c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 9088c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 9098c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 9108c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 9118c3ff71bSJunchao Zhang PetscFunctionReturn(0); 9128c3ff71bSJunchao Zhang } 913930e68a5SMark Adams 914930e68a5SMark Adams // factorizations 9158f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 9168f7e8f9dSMark Adams // 9178f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 9188f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 9198f7e8f9dSMark Adams // 9208f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 921930e68a5SMark Adams { 9228f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 9238f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 9248f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 925930e68a5SMark Adams PetscErrorCode ierr; 9268f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 9278f7e8f9dSMark Adams const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 9288f7e8f9dSMark Adams const PetscScalar *aa_d = aijkok->a_d.data(); 9298f7e8f9dSMark Adams PetscScalar *ba_d = baijkok->a_d.data(); 9308f7e8f9dSMark Adams PetscBool row_identity,col_identity; 9318f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 9328f7e8f9dSMark Adams PetscContainer container; 933930e68a5SMark Adams 934930e68a5SMark Adams PetscFunctionBegin; 9358f7e8f9dSMark Adams if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 9368f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 9378f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 9388f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 9398f7e8f9dSMark Adams if (container) { 9408f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 9418f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 9428f7e8f9dSMark Adams Nf = (*pNf)%1000; 9438f7e8f9dSMark Adams nv = (*pNf)/1000; 9448f7e8f9dSMark Adams if (nv>0) nVec = nv; 9458f7e8f9dSMark Adams } else Nf = 1; 9468f7e8f9dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 9478f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 9488f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 9498f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 9508f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 9518f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 9528f7e8f9dSMark Adams #endif 9538f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 9548f7e8f9dSMark Adams { 9558f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 9568f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 9578f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 9588f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 9598f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 9608f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 9618f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 9628f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 9638f7e8f9dSMark Adams size_t flops_h = 0.0; 9648f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 9658f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 9668f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 9678f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 9688f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 9698f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 9708f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 9718f7e8f9dSMark Adams // Fill A --> fact 9728f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 9738f7e8f9dSMark Adams const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda 9748f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 9758f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 9768f7e8f9dSMark Adams // zero rows of B 9778f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 9788f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 9798f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 9808f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 9818f7e8f9dSMark Adams /* zero (unfactored row) */ 9828f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 9838f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 9848f7e8f9dSMark Adams }); 9858f7e8f9dSMark Adams // copy A into B 9868f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 9878f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 9888f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 9898f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 9908f7e8f9dSMark Adams /* load in initial (unfactored row) */ 9918f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 9928f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 9938f7e8f9dSMark Adams PetscScalar vala = av[j]; 9948f7e8f9dSMark Adams if (colb == rowb) { 9958f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 9968f7e8f9dSMark Adams } else { 9978f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 9988f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 9998f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 10008f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 10018f7e8f9dSMark Adams if (pbj[j] == colb) { 10028f7e8f9dSMark Adams pba[j] = vala; 10038f7e8f9dSMark Adams set++; 10048f7e8f9dSMark Adams break; 10058f7e8f9dSMark Adams } 10068f7e8f9dSMark Adams } 10078f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 10088f7e8f9dSMark Adams } 10098f7e8f9dSMark Adams } 10108f7e8f9dSMark Adams }); 10118f7e8f9dSMark Adams }); 10128f7e8f9dSMark Adams Kokkos::fence(); 1013930e68a5SMark Adams 10148f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 10158f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 10168f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 10178f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 10188f7e8f9dSMark Adams const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda 10198f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 10208f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 10218f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 10228f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 10238f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 10248f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 10258f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 10268f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 10278f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 10288f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 10298f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 10308f7e8f9dSMark Adams else { 10318f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 10328f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 10338f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 10348f7e8f9dSMark Adams size_t st_idx; 10358f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 10368f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 10378f7e8f9dSMark Adams // get column, there has got to be a better way 10388f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 10398f7e8f9dSMark Adams //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 10408f7e8f9dSMark Adams if (pjL[j] == ii) { 10418f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 10428f7e8f9dSMark Adams idx = j; // output 10438f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 10448f7e8f9dSMark Adams } 10458f7e8f9dSMark Adams }, st_idx); 10468f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 10478f7e8f9dSMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 10488f7e8f9dSMark Adams else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 10498f7e8f9dSMark Adams // U(i+1,:end) 10508f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 10518f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 10528f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 10538f7e8f9dSMark Adams if (col==myk) { 10548f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 10558f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 10568f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 10578f7e8f9dSMark Adams } else { 10588f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 10598f7e8f9dSMark Adams PetscInt high, low; 10608f7e8f9dSMark Adams const PetscInt *startj; 10618f7e8f9dSMark Adams if (col<myk) { // L 10628f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 10638f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 10648f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 10658f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 10668f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 10678f7e8f9dSMark Adams } else { 10688f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 10698f7e8f9dSMark Adams start = ba_d + idx; 10708f7e8f9dSMark Adams startj= bj_d + idx; 10718f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 10728f7e8f9dSMark Adams } 10738f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 10748f7e8f9dSMark Adams low = 0; 10758f7e8f9dSMark Adams high = (PetscInt)(end-start); 10768f7e8f9dSMark Adams while (high-low > 5) { 10778f7e8f9dSMark Adams int t = (low+high)/2; 10788f7e8f9dSMark Adams if (startj[t] > col) high = t; 10798f7e8f9dSMark Adams else low = t; 10808f7e8f9dSMark Adams } 10818f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 10828f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 10838f7e8f9dSMark Adams } 10848f7e8f9dSMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 10858f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 10868f7e8f9dSMark Adams } 10878f7e8f9dSMark Adams }); 10888f7e8f9dSMark Adams } 10898f7e8f9dSMark Adams } 10908f7e8f9dSMark Adams }); 10918f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 10928f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 10938f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 10948f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 10958f7e8f9dSMark Adams }); 10968f7e8f9dSMark Adams Kokkos::fence(); 10978f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 10988f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 10998f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 11008f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 11018f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 11028f7e8f9dSMark Adams #endif 11038f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 11048f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 11058f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 11068f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 11078f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 11088f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 11098f7e8f9dSMark Adams if (i < end) { 11108f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 11118f7e8f9dSMark Adams *pv = 1.0/(*pv); 11128f7e8f9dSMark Adams } 11138f7e8f9dSMark Adams }); 11148f7e8f9dSMark Adams }); 11158f7e8f9dSMark Adams } 11168f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 11178f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 11188f7e8f9dSMark Adams #endif 11198f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 11208f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 11218f7e8f9dSMark Adams 11228f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 11238f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 11248f7e8f9dSMark Adams if (b->inode.size) { 11258f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 11268f7e8f9dSMark Adams } else if (row_identity && col_identity) { 11278f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 11288f7e8f9dSMark Adams } else { 11298f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 11308f7e8f9dSMark Adams } 11318f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 11328f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 11338f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 11348f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 11358f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 11368f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 11378f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 11388f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 11398f7e8f9dSMark Adams 1140930e68a5SMark Adams PetscFunctionReturn(0); 1141930e68a5SMark Adams } 1142930e68a5SMark Adams 114386a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1144930e68a5SMark Adams { 1145930e68a5SMark Adams PetscErrorCode ierr; 1146930e68a5SMark Adams 1147930e68a5SMark Adams PetscFunctionBegin; 1148930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 114986a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 115086a27549SJunchao Zhang PetscFunctionReturn(0); 115186a27549SJunchao Zhang } 115286a27549SJunchao Zhang 115386a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 115486a27549SJunchao Zhang { 115586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 115686a27549SJunchao Zhang 115786a27549SJunchao Zhang PetscFunctionBegin; 115886a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 115986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 116086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 116186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 116286a27549SJunchao Zhang } 116386a27549SJunchao Zhang PetscFunctionReturn(0); 116486a27549SJunchao Zhang } 116586a27549SJunchao Zhang 116686a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 116786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 116886a27549SJunchao Zhang { 116986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 117086a27549SJunchao Zhang MatColumnIndexType n = A->rmap->n; 117186a27549SJunchao Zhang 117286a27549SJunchao Zhang PetscFunctionBegin; 117386a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 117486a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 117586a27549SJunchao Zhang factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1); 117686a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 117786a27549SJunchao Zhang factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 117886a27549SJunchao Zhang factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 117986a27549SJunchao Zhang 118086a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 118186a27549SJunchao Zhang ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 118286a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 118386a27549SJunchao Zhang MatRowOffsetKokkosView,DefaultExecutionSpace>( 118486a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 118586a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 118686a27549SJunchao Zhang 118786a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 118886a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 118986a27549SJunchao Zhang */ 119086a27549SJunchao Zhang KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 119186a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 119286a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 119386a27549SJunchao Zhang 119486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 119586a27549SJunchao Zhang 119686a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 119786a27549SJunchao Zhang factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1); 119886a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 119986a27549SJunchao Zhang factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 120086a27549SJunchao Zhang factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 120186a27549SJunchao Zhang 120286a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 120386a27549SJunchao Zhang ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 120486a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 120586a27549SJunchao Zhang MatRowOffsetKokkosView,DefaultExecutionSpace>( 120686a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 120786a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 120886a27549SJunchao Zhang 120986a27549SJunchao Zhang /* Sort indices. See comments above */ 121086a27549SJunchao Zhang KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 121186a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 121286a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 121386a27549SJunchao Zhang 121486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 121586a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 121686a27549SJunchao Zhang } 121786a27549SJunchao Zhang PetscFunctionReturn(0); 121886a27549SJunchao Zhang } 121986a27549SJunchao Zhang 122086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 122186a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 122286a27549SJunchao Zhang { 122386a27549SJunchao Zhang PetscErrorCode ierr; 122486a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 122586a27549SJunchao Zhang PetscScalarKokkosView xv; 122686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 122786a27549SJunchao Zhang 122886a27549SJunchao Zhang PetscFunctionBegin; 122986a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 123086a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 123186a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 123286a27549SJunchao Zhang /* Solve L tmpv = b */ 123386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector); 123486a27549SJunchao Zhang /* Solve Ux = tmpv */ 123586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv); 123686a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 123786a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 123886a27549SJunchao Zhang PetscFunctionReturn(0); 123986a27549SJunchao Zhang } 124086a27549SJunchao Zhang 124186a27549SJunchao Zhang /* Solve A^T x = b, with A^T = U^T L^T */ 124286a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 124386a27549SJunchao Zhang { 124486a27549SJunchao Zhang PetscErrorCode ierr; 124586a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 124686a27549SJunchao Zhang PetscScalarKokkosView xv; 124786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 124886a27549SJunchao Zhang 124986a27549SJunchao Zhang PetscFunctionBegin; 125086a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 125186a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 125286a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 125386a27549SJunchao Zhang /* Solve U^T tmpv = b */ 125486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 125586a27549SJunchao Zhang 125686a27549SJunchao Zhang /* Solve L^T x = tmpv */ 125786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 125886a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 125986a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 126086a27549SJunchao Zhang PetscFunctionReturn(0); 126186a27549SJunchao Zhang } 126286a27549SJunchao Zhang 126386a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 126486a27549SJunchao Zhang { 126586a27549SJunchao Zhang PetscErrorCode ierr; 126686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 126786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 126886a27549SJunchao Zhang PetscInt fill_lev = info->levels; 126986a27549SJunchao Zhang 127086a27549SJunchao Zhang PetscFunctionBegin; 127186a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 127286a27549SJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 127386a27549SJunchao Zhang 127486a27549SJunchao Zhang B->assembled = PETSC_TRUE; 127586a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 127686a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 127786a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 127886a27549SJunchao Zhang B->ops->matsolve = NULL; 127986a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 128086a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 128186a27549SJunchao Zhang 128286a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 128386a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 128486a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 128586a27549SJunchao Zhang PetscFunctionReturn(0); 128686a27549SJunchao Zhang } 128786a27549SJunchao Zhang 128886a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 128986a27549SJunchao Zhang { 129086a27549SJunchao Zhang PetscErrorCode ierr; 129186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 129286a27549SJunchao Zhang Mat_SeqAIJ *b; 129386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 129486a27549SJunchao Zhang PetscInt fill_lev = info->levels; 129586a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 129686a27549SJunchao Zhang PetscInt n = A->rmap->n; 129786a27549SJunchao Zhang 129886a27549SJunchao Zhang PetscFunctionBegin; 129986a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 130086a27549SJunchao Zhang /* Rebuild factors */ 130186a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 130286a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 130386a27549SJunchao Zhang 130486a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 130586a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 130686a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 130786a27549SJunchao Zhang 130886a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 130986a27549SJunchao Zhang 131086a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 131186a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 131286a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 131386a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 131486a27549SJunchao Zhang 131586a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 131686a27549SJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 131786a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 131886a27549SJunchao Zhang 131986a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 132086a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 132186a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 132286a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 132386a27549SJunchao Zhang 132486a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 132586a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 132686a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 132786a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 132886a27549SJunchao Zhang #else 132986a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 133086a27549SJunchao Zhang #endif 133186a27549SJunchao Zhang 133286a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 133386a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 133486a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 133586a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 133686a27549SJunchao Zhang 133786a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 133886a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 133986a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 134086a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 134186a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 134286a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 134386a27549SJunchao Zhang 134486a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 134586a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1346930e68a5SMark Adams PetscFunctionReturn(0); 1347930e68a5SMark Adams } 1348930e68a5SMark Adams 13498f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 13508f7e8f9dSMark Adams { 13518f7e8f9dSMark Adams PetscErrorCode ierr; 13528f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 13538f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1354930e68a5SMark Adams 13558f7e8f9dSMark Adams PetscFunctionBegin; 13568f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 13578f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 13588f7e8f9dSMark Adams // move B data into Kokkos 13598f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 13608f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 13618f7e8f9dSMark Adams { 13628f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 13638f7e8f9dSMark Adams if (!baijkok->diag_d) { 13648f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1365152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 13668f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 13678f7e8f9dSMark Adams } 13688f7e8f9dSMark Adams } 13698f7e8f9dSMark Adams PetscFunctionReturn(0); 13708f7e8f9dSMark Adams } 13718f7e8f9dSMark Adams 137286a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1373930e68a5SMark Adams { 1374930e68a5SMark Adams PetscFunctionBegin; 1375930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1376930e68a5SMark Adams PetscFunctionReturn(0); 1377930e68a5SMark Adams } 1378930e68a5SMark Adams 13798f7e8f9dSMark Adams // use -pc_factor_mat_solver_type kokkosdevice 13808f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 13818f7e8f9dSMark Adams { 13828f7e8f9dSMark Adams PetscFunctionBegin; 13838f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 13848f7e8f9dSMark Adams PetscFunctionReturn(0); 13858f7e8f9dSMark Adams } 13868f7e8f9dSMark Adams 1387930e68a5SMark Adams /*MC 138886a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 138986a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1390930e68a5SMark Adams 1391930e68a5SMark Adams Level: beginner 1392930e68a5SMark Adams 139386a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1394930e68a5SMark Adams M*/ 139586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1396930e68a5SMark Adams { 1397930e68a5SMark Adams PetscErrorCode ierr; 1398930e68a5SMark Adams PetscInt n = A->rmap->n; 1399930e68a5SMark Adams 1400930e68a5SMark Adams PetscFunctionBegin; 1401930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1402930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1403930e68a5SMark Adams (*B)->factortype = ftype; 14044ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1405930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1406930e68a5SMark Adams 14078f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1408930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 140986a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 141086a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 141186a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 141286a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 141386a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 141486a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 141586a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1416930e68a5SMark Adams 1417930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 141886a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1419930e68a5SMark Adams PetscFunctionReturn(0); 1420930e68a5SMark Adams } 14218f7e8f9dSMark Adams 14228f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 14238f7e8f9dSMark Adams { 14248f7e8f9dSMark Adams PetscErrorCode ierr; 14258f7e8f9dSMark Adams PetscInt n = A->rmap->n; 14268f7e8f9dSMark Adams 14278f7e8f9dSMark Adams PetscFunctionBegin; 14288f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 14298f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 14308f7e8f9dSMark Adams (*B)->factortype = ftype; 1431f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 14324ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 14338f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 14348f7e8f9dSMark Adams 14358f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 14368f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 14378f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 14388f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 14398f7e8f9dSMark Adams 14408f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 14418f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 14428f7e8f9dSMark Adams PetscFunctionReturn(0); 14438f7e8f9dSMark Adams } 144486a27549SJunchao Zhang 144586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 144686a27549SJunchao Zhang { 144786a27549SJunchao Zhang PetscErrorCode ierr; 144886a27549SJunchao Zhang 144986a27549SJunchao Zhang PetscFunctionBegin; 145086a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 145186a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 145286a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 145386a27549SJunchao Zhang PetscFunctionReturn(0); 145486a27549SJunchao Zhang } 145586a27549SJunchao Zhang 1456