111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 38c3ff71bSJunchao Zhang #include <petscsystypes.h> 48c3ff71bSJunchao Zhang #include <petscerror.h> 58c3ff71bSJunchao Zhang 68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 7f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1086a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1186a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 1286a27549SJunchao Zhang 138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 148c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 158c3ff71bSJunchao Zhang 168c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 178c3ff71bSJunchao Zhang 188c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 198c3ff71bSJunchao Zhang { 208c3ff71bSJunchao Zhang PetscErrorCode ierr; 21a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 228c3ff71bSJunchao Zhang 238c3ff71bSJunchao Zhang PetscFunctionBegin; 248c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 2586a27549SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 26a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 27a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 28a587d139SMark } 298c3ff71bSJunchao Zhang /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 308c3ff71bSJunchao Zhang PetscFunctionReturn(0); 318c3ff71bSJunchao Zhang } 328c3ff71bSJunchao Zhang 3386a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 348c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 358c3ff71bSJunchao Zhang { 36152b3e56SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ*>(A->data); 378c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 388c3ff71bSJunchao Zhang PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 398c3ff71bSJunchao Zhang 408c3ff71bSJunchao Zhang PetscFunctionBegin; 4186a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 42152b3e56SJunchao Zhang /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */ 438c3ff71bSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 448c3ff71bSJunchao Zhang delete aijkok; 458c3ff71bSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 468c3ff71bSJunchao Zhang aijkok->nonzerostate = A->nonzerostate; 478c3ff71bSJunchao Zhang A->spptr = aijkok; 488c3ff71bSJunchao Zhang } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 4986a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 50152b3e56SJunchao Zhang aijkok->a_dual.modify_host(); /* Mark host is modified */ 51152b3e56SJunchao Zhang aijkok->a_dual.sync_device(); /* Sync the device */ 5286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 5386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 548c3ff71bSJunchao Zhang } 558c3ff71bSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 568c3ff71bSJunchao Zhang PetscFunctionReturn(0); 578c3ff71bSJunchao Zhang } 588c3ff71bSJunchao Zhang 5986a27549SJunchao Zhang /* Mark the CSR data on device is modified */ 6086a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A) 6186a27549SJunchao Zhang { 6286a27549SJunchao Zhang PetscErrorCode ierr; 6386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 6486a27549SJunchao Zhang 6586a27549SJunchao Zhang PetscFunctionBegin; 6686a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 6786a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 6886a27549SJunchao Zhang aijkok->a_dual.modify_device(); 6986a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 7086a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 7186a27549SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_GPU; 7286a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 7386a27549SJunchao Zhang PetscFunctionReturn(0); 7486a27549SJunchao Zhang } 7586a27549SJunchao Zhang 76f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 77f0cf5187SStefano Zampini { 78f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 79f0cf5187SStefano Zampini 80f0cf5187SStefano Zampini PetscFunctionBegin; 81f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 8286a27549SJunchao Zhang /* We do not expect one needs factors on host */ 8386a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 84f0cf5187SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 85f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 8686a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8786a27549SJunchao Zhang aijkok->a_dual.modify_device(); /* Mark device is modified */ 8886a27549SJunchao Zhang aijkok->a_dual.sync_host(); /* Sync the host */ 89f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 90f0cf5187SStefano Zampini } 91f0cf5187SStefano Zampini PetscFunctionReturn(0); 92f0cf5187SStefano Zampini } 93f0cf5187SStefano Zampini 94f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 95f0cf5187SStefano Zampini { 96f0cf5187SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 97f0cf5187SStefano Zampini PetscErrorCode ierr; 98f0cf5187SStefano Zampini 99f0cf5187SStefano Zampini PetscFunctionBegin; 100f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 101f0cf5187SStefano Zampini *array = a->a; 102f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 103f0cf5187SStefano Zampini PetscFunctionReturn(0); 104f0cf5187SStefano Zampini } 105f0cf5187SStefano Zampini 106a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 107*042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 108a587d139SMark { 109*042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 110*042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 111a587d139SMark 112a587d139SMark PetscFunctionBegin; 113a587d139SMark if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 114152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 115a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 116a587d139SMark PetscFunctionReturn(0); 117a587d139SMark } 118a587d139SMark 119a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 120*042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 121a587d139SMark { 122*042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 123a587d139SMark 124a587d139SMark PetscFunctionBegin; 125a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 126a587d139SMark *d_mat = aijkok->device_mat_d.data(); 127a587d139SMark } else { 128a587d139SMark PetscErrorCode ierr; 129a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 130a587d139SMark *d_mat = NULL; 131a587d139SMark } 132a587d139SMark PetscFunctionReturn(0); 133a587d139SMark } 134152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A) 135152b3e56SJunchao Zhang { 136152b3e56SJunchao Zhang PetscErrorCode ierr; 137152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 138152b3e56SJunchao Zhang 139152b3e56SJunchao Zhang PetscFunctionBegin; 140152b3e56SJunchao Zhang if (!aijkok->At) { /* Generate At for the first time */ 141152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr); 142152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 14386a27549SJunchao Zhang } else if (!aijkok->transpose_updated) { /* Only update At values */ 144152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr); 145152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 146152b3e56SJunchao Zhang } 14786a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 148152b3e56SJunchao Zhang PetscFunctionReturn(0); 149152b3e56SJunchao Zhang } 150152b3e56SJunchao Zhang 15186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A) 152152b3e56SJunchao Zhang { 153152b3e56SJunchao Zhang PetscErrorCode ierr; 154152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 155152b3e56SJunchao Zhang 156152b3e56SJunchao Zhang PetscFunctionBegin; 157152b3e56SJunchao Zhang if (!aijkok->Ah) { /* Generate Ah for the first time */ 158152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 159152b3e56SJunchao Zhang ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 160152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 16186a27549SJunchao Zhang } else if (!aijkok->hermitian_updated) { /* Only update Ah values */ 162152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 163152b3e56SJunchao Zhang ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 164152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 165152b3e56SJunchao Zhang } 16686a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 167152b3e56SJunchao Zhang PetscFunctionReturn(0); 168152b3e56SJunchao Zhang } 169a587d139SMark 1708c3ff71bSJunchao Zhang /* y = A x */ 1718c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1728c3ff71bSJunchao Zhang { 1738c3ff71bSJunchao Zhang PetscErrorCode ierr; 1748c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 175152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 176152b3e56SJunchao Zhang PetscScalarKokkosView yv; 1778c3ff71bSJunchao Zhang 1788c3ff71bSJunchao Zhang PetscFunctionBegin; 1798c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 180152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 181152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 1828c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 183152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 184152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 185152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 186bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 187152b3e56SJunchao Zhang /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 188152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 1898c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1908c3ff71bSJunchao Zhang } 1918c3ff71bSJunchao Zhang 1928c3ff71bSJunchao Zhang /* y = A^T x */ 1938c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1948c3ff71bSJunchao Zhang { 1958c3ff71bSJunchao Zhang PetscErrorCode ierr; 1968c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 197152b3e56SJunchao Zhang Mat B; 198152b3e56SJunchao Zhang const char *mode; 199152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 200152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2018c3ff71bSJunchao Zhang 2028c3ff71bSJunchao Zhang PetscFunctionBegin; 2038c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 204152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 205152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 206152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 207152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 208152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 209152b3e56SJunchao Zhang mode = "N"; 210152b3e56SJunchao Zhang } else { 211152b3e56SJunchao Zhang B = A; 212152b3e56SJunchao Zhang mode = "T"; 213152b3e56SJunchao Zhang } 214152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 215152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 216152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 217152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 218bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 219152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2208c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2218c3ff71bSJunchao Zhang } 2228c3ff71bSJunchao Zhang 2238c3ff71bSJunchao Zhang /* y = A^H x */ 2248c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2258c3ff71bSJunchao Zhang { 2268c3ff71bSJunchao Zhang PetscErrorCode ierr; 2278c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 228152b3e56SJunchao Zhang Mat B; 229152b3e56SJunchao Zhang const char *mode; 230152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 231152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2328c3ff71bSJunchao Zhang 2338c3ff71bSJunchao Zhang PetscFunctionBegin; 2348c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 235152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 236152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 237152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 23886a27549SJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 239152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 240152b3e56SJunchao Zhang mode = "N"; 241152b3e56SJunchao Zhang } else { 242152b3e56SJunchao Zhang B = A; 243152b3e56SJunchao Zhang mode = "C"; 244152b3e56SJunchao Zhang } 245152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 246152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 247152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 248152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 249bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 250152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2518c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2528c3ff71bSJunchao Zhang } 2538c3ff71bSJunchao Zhang 2548c3ff71bSJunchao Zhang /* z = A x + y */ 2558c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 2568c3ff71bSJunchao Zhang { 2578c3ff71bSJunchao Zhang PetscErrorCode ierr; 2588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 259152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 260152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2618c3ff71bSJunchao Zhang 2628c3ff71bSJunchao Zhang PetscFunctionBegin; 2638c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 264152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 265152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 266152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 2678c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 2688c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 270152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 271152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 272152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 273bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 274152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2758c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2768c3ff71bSJunchao Zhang } 2778c3ff71bSJunchao Zhang 2788c3ff71bSJunchao Zhang /* z = A^T x + y */ 2798c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 2808c3ff71bSJunchao Zhang { 2818c3ff71bSJunchao Zhang PetscErrorCode ierr; 2828c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 283152b3e56SJunchao Zhang Mat B; 284152b3e56SJunchao Zhang const char *mode; 285152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 286152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2878c3ff71bSJunchao Zhang 2888c3ff71bSJunchao Zhang PetscFunctionBegin; 2898c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 290152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 291152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 292152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 2938c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 294152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 295152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 296152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 297152b3e56SJunchao Zhang mode = "N"; 298152b3e56SJunchao Zhang } else { 299152b3e56SJunchao Zhang B = A; 300152b3e56SJunchao Zhang mode = "T"; 301152b3e56SJunchao Zhang } 302152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 303152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 304152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 305152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 306152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 307bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 308152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 3098c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3108c3ff71bSJunchao Zhang } 3118c3ff71bSJunchao Zhang 3128c3ff71bSJunchao Zhang /* z = A^H x + y */ 3138c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3148c3ff71bSJunchao Zhang { 3158c3ff71bSJunchao Zhang PetscErrorCode ierr; 3168c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 317152b3e56SJunchao Zhang Mat B; 318152b3e56SJunchao Zhang const char *mode; 319152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 320152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3218c3ff71bSJunchao Zhang 3228c3ff71bSJunchao Zhang PetscFunctionBegin; 3238c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 324152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 325152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 326152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3278c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 328152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 32986a27549SJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr); 330152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 331152b3e56SJunchao Zhang mode = "N"; 332152b3e56SJunchao Zhang } else { 333152b3e56SJunchao Zhang B = A; 334152b3e56SJunchao Zhang mode = "C"; 335152b3e56SJunchao Zhang } 336152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 337152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 338152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 339152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 340152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 341bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 342152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 343152b3e56SJunchao Zhang PetscFunctionReturn(0); 344152b3e56SJunchao Zhang } 345152b3e56SJunchao Zhang 346152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 347152b3e56SJunchao Zhang { 348152b3e56SJunchao Zhang PetscErrorCode ierr; 349152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 350152b3e56SJunchao Zhang 351152b3e56SJunchao Zhang PetscFunctionBegin; 352152b3e56SJunchao Zhang switch (op) { 353152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 354152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 355152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 356152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 357152b3e56SJunchao Zhang break; 358152b3e56SJunchao Zhang default: 359152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 360152b3e56SJunchao Zhang break; 361152b3e56SJunchao Zhang } 3628c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3638c3ff71bSJunchao Zhang } 3648c3ff71bSJunchao Zhang 3653d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3668c3ff71bSJunchao Zhang { 3678c3ff71bSJunchao Zhang PetscErrorCode ierr; 3688c3ff71bSJunchao Zhang Mat B; 369c58ef05eSStefano Zampini Mat_SeqAIJ *aij; 3708c3ff71bSJunchao Zhang 3718c3ff71bSJunchao Zhang PetscFunctionBegin; 372a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 3738c3ff71bSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 3748c3ff71bSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3758c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 3768c3ff71bSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3778c3ff71bSJunchao Zhang } 3788c3ff71bSJunchao Zhang 3798c3ff71bSJunchao Zhang B = *newmat; 3808c3ff71bSJunchao Zhang ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 38186a27549SJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 38286a27549SJunchao Zhang 3838c3ff71bSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 3840d8bce8aSStefano Zampini ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 385f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 386c58ef05eSStefano Zampini /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 387c58ef05eSStefano Zampini aij = (Mat_SeqAIJ*)B->data; 388c58ef05eSStefano Zampini aij->inode.use = PETSC_FALSE; 3898c3ff71bSJunchao Zhang 3908c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3918c3ff71bSJunchao Zhang } 3928c3ff71bSJunchao Zhang 3938c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 3948c3ff71bSJunchao Zhang { 3958c3ff71bSJunchao Zhang PetscErrorCode ierr; 3968c3ff71bSJunchao Zhang 3978c3ff71bSJunchao Zhang PetscFunctionBegin; 3988c3ff71bSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3998c3ff71bSJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 4008c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4018c3ff71bSJunchao Zhang } 4028c3ff71bSJunchao Zhang 4038c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 4048c3ff71bSJunchao Zhang { 4058c3ff71bSJunchao Zhang PetscErrorCode ierr; 40686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 4078c3ff71bSJunchao Zhang 4088c3ff71bSJunchao Zhang PetscFunctionBegin; 40986a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 41086a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 4118f7e8f9dSMark Adams if (aijkok) { 4128f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 413a587d139SMark delete aijkok->colmap_d; 414a587d139SMark delete aijkok->i_uncompressed_d; 415a587d139SMark } 4168f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 4178f7e8f9dSMark Adams } 4188c3ff71bSJunchao Zhang delete aijkok; 41986a27549SJunchao Zhang } else { 42086a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 42186a27549SJunchao Zhang } 422f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 423152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 424152b3e56SJunchao Zhang A->spptr = NULL; 4258c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 4268c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4278c3ff71bSJunchao Zhang } 4288c3ff71bSJunchao Zhang 42986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 43086a27549SJunchao Zhang { 43186a27549SJunchao Zhang PetscErrorCode ierr; 43286a27549SJunchao Zhang 43386a27549SJunchao Zhang PetscFunctionBegin; 43486a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 43586a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 43686a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 43786a27549SJunchao Zhang PetscFunctionReturn(0); 43886a27549SJunchao Zhang } 43986a27549SJunchao Zhang 440a3f881fbSStefano Zampini #if 0 441a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 442a3f881fbSStefano Zampini { 443a3f881fbSStefano Zampini MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 444a3f881fbSStefano Zampini 445a3f881fbSStefano Zampini PetscFunctionBegin; 446a3f881fbSStefano Zampini delete kh; 447a3f881fbSStefano Zampini PetscFunctionReturn(0); 448a3f881fbSStefano Zampini } 449a3f881fbSStefano Zampini 450a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 451a3f881fbSStefano Zampini { 452a3f881fbSStefano Zampini Mat_Product *product = C->product; 453a3f881fbSStefano Zampini Mat A,B; 454a3f881fbSStefano Zampini MatProductType ptype; 455a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 456a3f881fbSStefano Zampini bool tA,tB; 457a3f881fbSStefano Zampini PetscErrorCode ierr; 458a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 459a3f881fbSStefano Zampini Mat_SeqAIJ *c; 460a3f881fbSStefano Zampini 461a3f881fbSStefano Zampini PetscFunctionBegin; 462a3f881fbSStefano Zampini MatCheckProduct(C,1); 463a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 464a3f881fbSStefano Zampini A = product->A; 465a3f881fbSStefano Zampini B = product->B; 466a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 467a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 468a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 469a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 470a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 471a3f881fbSStefano Zampini kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 472a3f881fbSStefano Zampini ptype = product->type; 473a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 474a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 475a3f881fbSStefano Zampini switch (ptype) { 476a3f881fbSStefano Zampini case MATPRODUCT_AB: 477a3f881fbSStefano Zampini tA = false; 478a3f881fbSStefano Zampini tB = false; 479a3f881fbSStefano Zampini break; 480a3f881fbSStefano Zampini case MATPRODUCT_AtB: 481a3f881fbSStefano Zampini tA = true; 482a3f881fbSStefano Zampini tB = false; 483a3f881fbSStefano Zampini break; 484a3f881fbSStefano Zampini case MATPRODUCT_ABt: 485a3f881fbSStefano Zampini tA = false; 486a3f881fbSStefano Zampini tB = true; 487a3f881fbSStefano Zampini break; 488a3f881fbSStefano Zampini default: 489a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 490a3f881fbSStefano Zampini } 491a3f881fbSStefano Zampini 492a3f881fbSStefano Zampini KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 493a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 494a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 495a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 496a3f881fbSStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 497a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 498a3f881fbSStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 499a3f881fbSStefano Zampini c->reallocs = 0; 500a3f881fbSStefano Zampini C->info.mallocs += 0; 501a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 502a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 503a3f881fbSStefano Zampini C->num_ass++; 504a3f881fbSStefano Zampini /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 505a3f881fbSStefano Zampini // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 506a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_BOTH; 507a3f881fbSStefano Zampini // Also, we should add support to copy back from device to host 508a3f881fbSStefano Zampini PetscFunctionReturn(0); 509a3f881fbSStefano Zampini } 510a3f881fbSStefano Zampini 511a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 512a3f881fbSStefano Zampini { 513a3f881fbSStefano Zampini Mat_Product *product = C->product; 514a3f881fbSStefano Zampini Mat A,B; 515a3f881fbSStefano Zampini MatProductType ptype; 516a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 517a3f881fbSStefano Zampini PetscInt m,n,k; 518a3f881fbSStefano Zampini bool tA,tB; 519a3f881fbSStefano Zampini PetscErrorCode ierr; 520a3f881fbSStefano Zampini Mat_SeqAIJ *c; 521a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 522a3f881fbSStefano Zampini 523a3f881fbSStefano Zampini PetscFunctionBegin; 524a3f881fbSStefano Zampini MatCheckProduct(C,1); 525a3f881fbSStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 526a3f881fbSStefano Zampini A = product->A; 527a3f881fbSStefano Zampini B = product->B; 528a3f881fbSStefano Zampini // TODO only copy the i,j data, not the values 529a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 530a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 531a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 532a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 533a3f881fbSStefano Zampini ptype = product->type; 534a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 535a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 536a3f881fbSStefano Zampini switch (ptype) { 537a3f881fbSStefano Zampini case MATPRODUCT_AB: 538a3f881fbSStefano Zampini tA = false; 539a3f881fbSStefano Zampini tB = false; 540a3f881fbSStefano Zampini m = A->rmap->n; 541a3f881fbSStefano Zampini n = B->cmap->n; 542a3f881fbSStefano Zampini break; 543a3f881fbSStefano Zampini case MATPRODUCT_AtB: 544a3f881fbSStefano Zampini tA = true; 545a3f881fbSStefano Zampini tB = false; 546a3f881fbSStefano Zampini m = A->cmap->n; 547a3f881fbSStefano Zampini n = B->cmap->n; 548a3f881fbSStefano Zampini break; 549a3f881fbSStefano Zampini case MATPRODUCT_ABt: 550a3f881fbSStefano Zampini tA = false; 551a3f881fbSStefano Zampini tB = true; 552a3f881fbSStefano Zampini m = A->rmap->n; 553a3f881fbSStefano Zampini n = B->rmap->n; 554a3f881fbSStefano Zampini break; 555a3f881fbSStefano Zampini default: 556a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 557a3f881fbSStefano Zampini } 558a3f881fbSStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 559a3f881fbSStefano Zampini ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 560a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 561a3f881fbSStefano Zampini 562a3f881fbSStefano Zampini kh = new MatMatKernelHandle_t; 563a3f881fbSStefano Zampini // TODO SZ: ADD RUNTIME SELECTION OF THESE 564a3f881fbSStefano Zampini kh->set_team_work_size(16); 565a3f881fbSStefano Zampini kh->set_dynamic_scheduling(true); 566a3f881fbSStefano Zampini // Select an spgemm algorithm, limited by configuration at compile-time and 567a3f881fbSStefano Zampini // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 568a3f881fbSStefano Zampini // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 569a3f881fbSStefano Zampini std::string myalg("SPGEMM_KK_MEMORY"); 570a3f881fbSStefano Zampini kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 571a3f881fbSStefano Zampini 572a3f881fbSStefano Zampini // TODO JZ 573a3f881fbSStefano Zampini ckok = NULL; //new Mat_SeqAIJKokkos(); 574a3f881fbSStefano Zampini C->spptr = ckok; 575a3f881fbSStefano Zampini KokkosCsrMatrix_t ccsr; // here only to have the code compile 576a3f881fbSStefano Zampini KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 577*042217e8SBarry Smith 578a3f881fbSStefano Zampini c->singlemalloc = PETSC_FALSE; 579a3f881fbSStefano Zampini c->free_a = PETSC_TRUE; 580a3f881fbSStefano Zampini c->free_ij = PETSC_TRUE; 581a3f881fbSStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 582a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 583a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 584*042217e8SBarry Smith 585a3f881fbSStefano Zampini // TODO JZ copy from device to c->i and c->j 586*042217e8SBarry Smith 587a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 588a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 589a3f881fbSStefano Zampini c->maxnz = c->nz; 590a3f881fbSStefano Zampini c->nonzerorowcnt = 0; 591a3f881fbSStefano Zampini c->rmax = 0; 592a3f881fbSStefano Zampini for (k = 0; k < m; k++) { 593a3f881fbSStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 594a3f881fbSStefano Zampini c->ilen[k] = c->imax[k] = nn; 595a3f881fbSStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 596a3f881fbSStefano Zampini c->rmax = PetscMax(c->rmax,nn); 597a3f881fbSStefano Zampini } 598a3f881fbSStefano Zampini 599a3f881fbSStefano Zampini C->nonzerostate++; 600a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 601a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 602a3f881fbSStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 603a3f881fbSStefano Zampini ckok->nonzerostate = C->nonzerostate; 604a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 605a3f881fbSStefano Zampini C->preallocated = PETSC_TRUE; 606a3f881fbSStefano Zampini C->assembled = PETSC_FALSE; 607a3f881fbSStefano Zampini C->was_assembled = PETSC_FALSE; 608a3f881fbSStefano Zampini 609a3f881fbSStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 610a3f881fbSStefano Zampini C->product->data = kh; 611a3f881fbSStefano Zampini C->product->destroy = MatMatKernelHandleDestroy_Private; 612a3f881fbSStefano Zampini PetscFunctionReturn(0); 613a3f881fbSStefano Zampini } 614a3f881fbSStefano Zampini 615a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 616a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 617a3f881fbSStefano Zampini { 618a3f881fbSStefano Zampini Mat_Product *product = mat->product; 619a3f881fbSStefano Zampini PetscErrorCode ierr; 620a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 621a3f881fbSStefano Zampini 622a3f881fbSStefano Zampini PetscFunctionBegin; 623a3f881fbSStefano Zampini MatCheckProduct(mat,1); 624a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 625a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 626a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 627a3f881fbSStefano Zampini } 628a3f881fbSStefano Zampini if (Biskok && Ciskok) { 629a3f881fbSStefano Zampini switch (product->type) { 630a3f881fbSStefano Zampini case MATPRODUCT_AB: 631a3f881fbSStefano Zampini case MATPRODUCT_AtB: 632a3f881fbSStefano Zampini case MATPRODUCT_ABt: 633a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 634a3f881fbSStefano Zampini break; 635a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 636a3f881fbSStefano Zampini case MATPRODUCT_RARt: 637a3f881fbSStefano Zampini case MATPRODUCT_ABC: 638a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 639a3f881fbSStefano Zampini break; 640a3f881fbSStefano Zampini default: 641a3f881fbSStefano Zampini break; 642a3f881fbSStefano Zampini } 643a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 644a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 645a3f881fbSStefano Zampini } 646a3f881fbSStefano Zampini PetscFunctionReturn(0); 647a3f881fbSStefano Zampini } 648a3f881fbSStefano Zampini #endif 649a3f881fbSStefano Zampini 650a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 651a587d139SMark { 652a587d139SMark PetscErrorCode ierr; 653a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 654a587d139SMark PetscFunctionBegin; 65586a27549SJunchao Zhang if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 656a587d139SMark ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 657a587d139SMark PetscFunctionReturn(0); 658a587d139SMark } 659a587d139SMark 660f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 661f0cf5187SStefano Zampini { 662f0cf5187SStefano Zampini PetscErrorCode ierr; 663f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 664f0cf5187SStefano Zampini 665f0cf5187SStefano Zampini PetscFunctionBegin; 666f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 667f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 668f0cf5187SStefano Zampini KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 669f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 670f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 671f0cf5187SStefano Zampini ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 672f0cf5187SStefano Zampini // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 673f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 674f0cf5187SStefano Zampini PetscFunctionReturn(0); 675f0cf5187SStefano Zampini } 676f0cf5187SStefano Zampini 677a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 678a587d139SMark { 679a587d139SMark PetscErrorCode ierr; 680a587d139SMark PetscBool both = PETSC_FALSE; 681a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 682a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 683a587d139SMark 684a587d139SMark PetscFunctionBegin; 685a587d139SMark if (aijkok && aijkok->a_d.data()) { 686f0cf5187SStefano Zampini KokkosBlas::fill(aijkok->a_d,0.0); 687a587d139SMark both = PETSC_TRUE; 688a587d139SMark } 689a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 690a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 691a587d139SMark if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 692a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 693a587d139SMark PetscFunctionReturn(0); 694a587d139SMark } 695a587d139SMark 696db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 697db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 698db78de30SJunchao Zhang { 699db78de30SJunchao Zhang PetscErrorCode ierr; 700db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 701db78de30SJunchao Zhang 702db78de30SJunchao Zhang PetscFunctionBegin; 703db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 704db78de30SJunchao Zhang PetscValidPointer(kv,2); 705db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 706db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 707db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 708db78de30SJunchao Zhang *kv = aijkok->a_d; 709db78de30SJunchao Zhang PetscFunctionReturn(0); 710db78de30SJunchao Zhang } 711db78de30SJunchao Zhang 712db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 713db78de30SJunchao Zhang { 714db78de30SJunchao Zhang PetscFunctionBegin; 715db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 716db78de30SJunchao Zhang PetscValidPointer(kv,2); 717db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 718db78de30SJunchao Zhang PetscFunctionReturn(0); 719db78de30SJunchao Zhang } 720db78de30SJunchao Zhang 721db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 722db78de30SJunchao Zhang { 723db78de30SJunchao Zhang PetscErrorCode ierr; 724db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 725db78de30SJunchao Zhang 726db78de30SJunchao Zhang PetscFunctionBegin; 727db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 728db78de30SJunchao Zhang PetscValidPointer(kv,2); 729db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 730db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 731db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 732db78de30SJunchao Zhang *kv = aijkok->a_d; 733db78de30SJunchao Zhang PetscFunctionReturn(0); 734db78de30SJunchao Zhang } 735db78de30SJunchao Zhang 736db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 737db78de30SJunchao Zhang { 738db78de30SJunchao Zhang PetscErrorCode ierr; 739db78de30SJunchao Zhang 740db78de30SJunchao Zhang PetscFunctionBegin; 741db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 742db78de30SJunchao Zhang PetscValidPointer(kv,2); 743db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 744db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr); 745db78de30SJunchao Zhang PetscFunctionReturn(0); 746db78de30SJunchao Zhang } 747db78de30SJunchao Zhang 748db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 749db78de30SJunchao Zhang { 750db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 751db78de30SJunchao Zhang 752db78de30SJunchao Zhang PetscFunctionBegin; 753db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 754db78de30SJunchao Zhang PetscValidPointer(kv,2); 755db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 756db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 757db78de30SJunchao Zhang *kv = aijkok->a_d; 758db78de30SJunchao Zhang PetscFunctionReturn(0); 759db78de30SJunchao Zhang } 760db78de30SJunchao Zhang 761db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 762db78de30SJunchao Zhang { 763db78de30SJunchao Zhang PetscErrorCode ierr; 764db78de30SJunchao Zhang 765db78de30SJunchao Zhang PetscFunctionBegin; 766db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 767db78de30SJunchao Zhang PetscValidPointer(kv,2); 768db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 769db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr); 770db78de30SJunchao Zhang PetscFunctionReturn(0); 771db78de30SJunchao Zhang } 772db78de30SJunchao Zhang 773db78de30SJunchao Zhang /* Computes Y += a*X */ 774f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 775a587d139SMark { 776a587d139SMark PetscErrorCode ierr; 777a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 778a587d139SMark 779a587d139SMark PetscFunctionBegin; 780f0cf5187SStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 781a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 782a587d139SMark PetscFunctionReturn(0); 783a587d139SMark } 784db78de30SJunchao Zhang 785f0cf5187SStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 786a587d139SMark PetscBool e; 787a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 788a587d139SMark if (e) { 789a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 790f0cf5187SStefano Zampini if (e) str = SAME_NONZERO_PATTERN; 791a587d139SMark } 792a587d139SMark } 793db78de30SJunchao Zhang 794a587d139SMark if (str != SAME_NONZERO_PATTERN) { 795db78de30SJunchao Zhang /* TODO: do MatAXPY on device */ 796a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 797a587d139SMark PetscFunctionReturn(0); 798a587d139SMark } else { 799db78de30SJunchao Zhang ConstPetscScalarKokkosView xv; 800db78de30SJunchao Zhang PetscScalarKokkosView yv; 801db78de30SJunchao Zhang 802db78de30SJunchao Zhang ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr); 803db78de30SJunchao Zhang ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr); 804db78de30SJunchao Zhang KokkosBlas::axpy(a,xv,yv); 805db78de30SJunchao Zhang ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr); 806db78de30SJunchao Zhang ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr); 807a587d139SMark } 808a587d139SMark PetscFunctionReturn(0); 809a587d139SMark } 810a587d139SMark 81186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 8128f7e8f9dSMark Adams { 8138f7e8f9dSMark Adams PetscErrorCode ierr; 8148f7e8f9dSMark Adams 8158f7e8f9dSMark Adams PetscFunctionBegin; 8168f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 8178f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 8188f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 8198f7e8f9dSMark Adams PetscFunctionReturn(0); 8208f7e8f9dSMark Adams } 8218f7e8f9dSMark Adams 8228c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 8238c3ff71bSJunchao Zhang { 8248c3ff71bSJunchao Zhang PetscFunctionBegin; 8256f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 8266f3d89d0SStefano Zampini 827a587d139SMark A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 8288c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 8298c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 8308c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 831a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 832f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 833a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 834a3f881fbSStefano Zampini //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 8358c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 8368c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 8378c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 8388c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 8398c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 8408c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 841152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 8428c3ff71bSJunchao Zhang PetscFunctionReturn(0); 8438c3ff71bSJunchao Zhang } 8448c3ff71bSJunchao Zhang 8458c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 846152b3e56SJunchao Zhang /*@C 8478c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 8488c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 8498c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 8508c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 8518c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 8528c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 8538c3ff71bSJunchao Zhang 8548c3ff71bSJunchao Zhang Collective 8558c3ff71bSJunchao Zhang 8568c3ff71bSJunchao Zhang Input Parameters: 8578c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 8588c3ff71bSJunchao Zhang . m - number of rows 8598c3ff71bSJunchao Zhang . n - number of columns 8608c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 8618c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 8628c3ff71bSJunchao Zhang (possibly different for each row) or NULL 8638c3ff71bSJunchao Zhang 8648c3ff71bSJunchao Zhang Output Parameter: 8658c3ff71bSJunchao Zhang . A - the matrix 8668c3ff71bSJunchao Zhang 8678c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 8688c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 8698c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 8708c3ff71bSJunchao Zhang 8718c3ff71bSJunchao Zhang Notes: 8728c3ff71bSJunchao Zhang If nnz is given then nz is ignored 8738c3ff71bSJunchao Zhang 8748c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 8758c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 8768c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 8778c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 8788c3ff71bSJunchao Zhang 8798c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 8808c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 8818c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 8828c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 8838c3ff71bSJunchao Zhang 8848c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 8858c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 8868c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 8878c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 8888c3ff71bSJunchao Zhang 8898c3ff71bSJunchao Zhang Level: intermediate 8908c3ff71bSJunchao Zhang 8918c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 8928c3ff71bSJunchao Zhang @*/ 8938c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 8948c3ff71bSJunchao Zhang { 8958c3ff71bSJunchao Zhang PetscErrorCode ierr; 8968c3ff71bSJunchao Zhang 8978c3ff71bSJunchao Zhang PetscFunctionBegin; 8988c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 8998c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 9008c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 9018c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 9028c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 9038c3ff71bSJunchao Zhang PetscFunctionReturn(0); 9048c3ff71bSJunchao Zhang } 905930e68a5SMark Adams 9068f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 9078f7e8f9dSMark Adams // 9088f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 9098f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 9108f7e8f9dSMark Adams // 9118f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 912930e68a5SMark Adams { 9138f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 9148f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 9158f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 916930e68a5SMark Adams PetscErrorCode ierr; 9178f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 9188f7e8f9dSMark Adams const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 9198f7e8f9dSMark Adams const PetscScalar *aa_d = aijkok->a_d.data(); 9208f7e8f9dSMark Adams PetscScalar *ba_d = baijkok->a_d.data(); 9218f7e8f9dSMark Adams PetscBool row_identity,col_identity; 9228f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 9238f7e8f9dSMark Adams PetscContainer container; 924930e68a5SMark Adams 925930e68a5SMark Adams PetscFunctionBegin; 9268f7e8f9dSMark Adams if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 9278f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 9288f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 9298f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 9308f7e8f9dSMark Adams if (container) { 9318f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 9328f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 9338f7e8f9dSMark Adams Nf = (*pNf)%1000; 9348f7e8f9dSMark Adams nv = (*pNf)/1000; 9358f7e8f9dSMark Adams if (nv>0) nVec = nv; 9368f7e8f9dSMark Adams } else Nf = 1; 9378f7e8f9dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 9388f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 9398f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 9408f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 9418f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 9428f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 9438f7e8f9dSMark Adams #endif 9448f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 9458f7e8f9dSMark Adams { 9468f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 9478f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 9488f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 9498f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 9508f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 9518f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 9528f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 9538f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 9548f7e8f9dSMark Adams size_t flops_h = 0.0; 9558f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 9568f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 9578f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 9588f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 9598f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 9608f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 9618f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 9628f7e8f9dSMark Adams // Fill A --> fact 9638f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 964*042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 9658f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 9668f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 9678f7e8f9dSMark Adams // zero rows of B 9688f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 9698f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 9708f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 9718f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 9728f7e8f9dSMark Adams /* zero (unfactored row) */ 9738f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 9748f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 9758f7e8f9dSMark Adams }); 9768f7e8f9dSMark Adams // copy A into B 9778f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 9788f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 9798f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 9808f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 9818f7e8f9dSMark Adams /* load in initial (unfactored row) */ 9828f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 9838f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 9848f7e8f9dSMark Adams PetscScalar vala = av[j]; 9858f7e8f9dSMark Adams if (colb == rowb) { 9868f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 9878f7e8f9dSMark Adams } else { 9888f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 9898f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 9908f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 9918f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 9928f7e8f9dSMark Adams if (pbj[j] == colb) { 9938f7e8f9dSMark Adams pba[j] = vala; 9948f7e8f9dSMark Adams set++; 9958f7e8f9dSMark Adams break; 9968f7e8f9dSMark Adams } 9978f7e8f9dSMark Adams } 9988f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 9998f7e8f9dSMark Adams } 10008f7e8f9dSMark Adams } 10018f7e8f9dSMark Adams }); 10028f7e8f9dSMark Adams }); 10038f7e8f9dSMark Adams Kokkos::fence(); 1004930e68a5SMark Adams 10058f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 10068f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 10078f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 10088f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1009*042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 10108f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 10118f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 10128f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 10138f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 10148f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 10158f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 10168f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 10178f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 10188f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 10198f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 10208f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 10218f7e8f9dSMark Adams else { 10228f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 10238f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 10248f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 10258f7e8f9dSMark Adams size_t st_idx; 10268f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 10278f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 10288f7e8f9dSMark Adams // get column, there has got to be a better way 10298f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 10308f7e8f9dSMark Adams //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 10318f7e8f9dSMark Adams if (pjL[j] == ii) { 10328f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 10338f7e8f9dSMark Adams idx = j; // output 10348f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 10358f7e8f9dSMark Adams } 10368f7e8f9dSMark Adams }, st_idx); 10378f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 10388f7e8f9dSMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 10398f7e8f9dSMark Adams else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 10408f7e8f9dSMark Adams // U(i+1,:end) 10418f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 10428f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 10438f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 10448f7e8f9dSMark Adams if (col==myk) { 10458f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 10468f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 10478f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 10488f7e8f9dSMark Adams } else { 10498f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 10508f7e8f9dSMark Adams PetscInt high, low; 10518f7e8f9dSMark Adams const PetscInt *startj; 10528f7e8f9dSMark Adams if (col<myk) { // L 10538f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 10548f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 10558f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 10568f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 10578f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 10588f7e8f9dSMark Adams } else { 10598f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 10608f7e8f9dSMark Adams start = ba_d + idx; 10618f7e8f9dSMark Adams startj= bj_d + idx; 10628f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 10638f7e8f9dSMark Adams } 10648f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 10658f7e8f9dSMark Adams low = 0; 10668f7e8f9dSMark Adams high = (PetscInt)(end-start); 10678f7e8f9dSMark Adams while (high-low > 5) { 10688f7e8f9dSMark Adams int t = (low+high)/2; 10698f7e8f9dSMark Adams if (startj[t] > col) high = t; 10708f7e8f9dSMark Adams else low = t; 10718f7e8f9dSMark Adams } 10728f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 10738f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 10748f7e8f9dSMark Adams } 10758f7e8f9dSMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 10768f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 10778f7e8f9dSMark Adams } 10788f7e8f9dSMark Adams }); 10798f7e8f9dSMark Adams } 10808f7e8f9dSMark Adams } 10818f7e8f9dSMark Adams }); 10828f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 10838f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 10848f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 10858f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 10868f7e8f9dSMark Adams }); 10878f7e8f9dSMark Adams Kokkos::fence(); 10888f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 10898f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 10908f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 10918f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 10928f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 10938f7e8f9dSMark Adams #endif 10948f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 10958f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 10968f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 10978f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 10988f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 10998f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 11008f7e8f9dSMark Adams if (i < end) { 11018f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 11028f7e8f9dSMark Adams *pv = 1.0/(*pv); 11038f7e8f9dSMark Adams } 11048f7e8f9dSMark Adams }); 11058f7e8f9dSMark Adams }); 11068f7e8f9dSMark Adams } 11078f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 11088f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 11098f7e8f9dSMark Adams #endif 11108f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 11118f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 11128f7e8f9dSMark Adams 11138f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 11148f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 11158f7e8f9dSMark Adams if (b->inode.size) { 11168f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 11178f7e8f9dSMark Adams } else if (row_identity && col_identity) { 11188f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 11198f7e8f9dSMark Adams } else { 11208f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 11218f7e8f9dSMark Adams } 11228f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 11238f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 11248f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 11258f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 11268f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 11278f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 11288f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 11298f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 11308f7e8f9dSMark Adams 1131930e68a5SMark Adams PetscFunctionReturn(0); 1132930e68a5SMark Adams } 1133930e68a5SMark Adams 113486a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1135930e68a5SMark Adams { 1136930e68a5SMark Adams PetscErrorCode ierr; 1137930e68a5SMark Adams 1138930e68a5SMark Adams PetscFunctionBegin; 1139930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 114086a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 114186a27549SJunchao Zhang PetscFunctionReturn(0); 114286a27549SJunchao Zhang } 114386a27549SJunchao Zhang 114486a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 114586a27549SJunchao Zhang { 114686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 114786a27549SJunchao Zhang 114886a27549SJunchao Zhang PetscFunctionBegin; 114986a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 115086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 115186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 115286a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 115386a27549SJunchao Zhang } 115486a27549SJunchao Zhang PetscFunctionReturn(0); 115586a27549SJunchao Zhang } 115686a27549SJunchao Zhang 115786a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 115886a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 115986a27549SJunchao Zhang { 116086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 116186a27549SJunchao Zhang MatColumnIndexType n = A->rmap->n; 116286a27549SJunchao Zhang 116386a27549SJunchao Zhang PetscFunctionBegin; 116486a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 116586a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 116686a27549SJunchao Zhang factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1); 116786a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 116886a27549SJunchao Zhang factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 116986a27549SJunchao Zhang factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 117086a27549SJunchao Zhang 117186a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 117286a27549SJunchao Zhang ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 117386a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 117486a27549SJunchao Zhang MatRowOffsetKokkosView,DefaultExecutionSpace>( 117586a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 117686a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 117786a27549SJunchao Zhang 117886a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 117986a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 118086a27549SJunchao Zhang */ 118186a27549SJunchao Zhang KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 118286a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 118386a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 118486a27549SJunchao Zhang 118586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 118686a27549SJunchao Zhang 118786a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 118886a27549SJunchao Zhang factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1); 118986a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 119086a27549SJunchao Zhang factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 119186a27549SJunchao Zhang factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 119286a27549SJunchao Zhang 119386a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 119486a27549SJunchao Zhang ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView, 119586a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView, 119686a27549SJunchao Zhang MatRowOffsetKokkosView,DefaultExecutionSpace>( 119786a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 119886a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 119986a27549SJunchao Zhang 120086a27549SJunchao Zhang /* Sort indices. See comments above */ 120186a27549SJunchao Zhang KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace, 120286a27549SJunchao Zhang MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>( 120386a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 120486a27549SJunchao Zhang 120586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 120686a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 120786a27549SJunchao Zhang } 120886a27549SJunchao Zhang PetscFunctionReturn(0); 120986a27549SJunchao Zhang } 121086a27549SJunchao Zhang 121186a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 121286a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 121386a27549SJunchao Zhang { 121486a27549SJunchao Zhang PetscErrorCode ierr; 121586a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 121686a27549SJunchao Zhang PetscScalarKokkosView xv; 121786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 121886a27549SJunchao Zhang 121986a27549SJunchao Zhang PetscFunctionBegin; 122086a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 122186a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 122286a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 122386a27549SJunchao Zhang /* Solve L tmpv = b */ 122486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector); 122586a27549SJunchao Zhang /* Solve Ux = tmpv */ 122686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv); 122786a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 122886a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 122986a27549SJunchao Zhang PetscFunctionReturn(0); 123086a27549SJunchao Zhang } 123186a27549SJunchao Zhang 123286a27549SJunchao Zhang /* Solve A^T x = b, with A^T = U^T L^T */ 123386a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 123486a27549SJunchao Zhang { 123586a27549SJunchao Zhang PetscErrorCode ierr; 123686a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 123786a27549SJunchao Zhang PetscScalarKokkosView xv; 123886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 123986a27549SJunchao Zhang 124086a27549SJunchao Zhang PetscFunctionBegin; 124186a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 124286a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 124386a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 124486a27549SJunchao Zhang /* Solve U^T tmpv = b */ 124586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 124686a27549SJunchao Zhang 124786a27549SJunchao Zhang /* Solve L^T x = tmpv */ 124886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 124986a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 125086a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 125186a27549SJunchao Zhang PetscFunctionReturn(0); 125286a27549SJunchao Zhang } 125386a27549SJunchao Zhang 125486a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 125586a27549SJunchao Zhang { 125686a27549SJunchao Zhang PetscErrorCode ierr; 125786a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 125886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 125986a27549SJunchao Zhang PetscInt fill_lev = info->levels; 126086a27549SJunchao Zhang 126186a27549SJunchao Zhang PetscFunctionBegin; 126286a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 126386a27549SJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 126486a27549SJunchao Zhang 126586a27549SJunchao Zhang B->assembled = PETSC_TRUE; 126686a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 126786a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 126886a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 126986a27549SJunchao Zhang B->ops->matsolve = NULL; 127086a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 127186a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 127286a27549SJunchao Zhang 127386a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 127486a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 127586a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 127686a27549SJunchao Zhang PetscFunctionReturn(0); 127786a27549SJunchao Zhang } 127886a27549SJunchao Zhang 127986a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 128086a27549SJunchao Zhang { 128186a27549SJunchao Zhang PetscErrorCode ierr; 128286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 128386a27549SJunchao Zhang Mat_SeqAIJ *b; 128486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 128586a27549SJunchao Zhang PetscInt fill_lev = info->levels; 128686a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 128786a27549SJunchao Zhang PetscInt n = A->rmap->n; 128886a27549SJunchao Zhang 128986a27549SJunchao Zhang PetscFunctionBegin; 129086a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 129186a27549SJunchao Zhang /* Rebuild factors */ 129286a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 129386a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 129486a27549SJunchao Zhang 129586a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 129686a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 129786a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 129886a27549SJunchao Zhang 129986a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 130086a27549SJunchao Zhang 130186a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 130286a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 130386a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 130486a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 130586a27549SJunchao Zhang 130686a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 130786a27549SJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 130886a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 130986a27549SJunchao Zhang 131086a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 131186a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 131286a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 131386a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 131486a27549SJunchao Zhang 131586a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 131686a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 131786a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 131886a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 131986a27549SJunchao Zhang #else 132086a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 132186a27549SJunchao Zhang #endif 132286a27549SJunchao Zhang 132386a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 132486a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 132586a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 132686a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 132786a27549SJunchao Zhang 132886a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 132986a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 133086a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 133186a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 133286a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 133386a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 133486a27549SJunchao Zhang 133586a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 133686a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1337930e68a5SMark Adams PetscFunctionReturn(0); 1338930e68a5SMark Adams } 1339930e68a5SMark Adams 13408f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 13418f7e8f9dSMark Adams { 13428f7e8f9dSMark Adams PetscErrorCode ierr; 13438f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 13448f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1345930e68a5SMark Adams 13468f7e8f9dSMark Adams PetscFunctionBegin; 13478f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 13488f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 13498f7e8f9dSMark Adams // move B data into Kokkos 13508f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 13518f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 13528f7e8f9dSMark Adams { 13538f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 13548f7e8f9dSMark Adams if (!baijkok->diag_d) { 13558f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1356152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 13578f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 13588f7e8f9dSMark Adams } 13598f7e8f9dSMark Adams } 13608f7e8f9dSMark Adams PetscFunctionReturn(0); 13618f7e8f9dSMark Adams } 13628f7e8f9dSMark Adams 136386a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1364930e68a5SMark Adams { 1365930e68a5SMark Adams PetscFunctionBegin; 1366930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1367930e68a5SMark Adams PetscFunctionReturn(0); 1368930e68a5SMark Adams } 1369930e68a5SMark Adams 13708f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 13718f7e8f9dSMark Adams { 13728f7e8f9dSMark Adams PetscFunctionBegin; 13738f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 13748f7e8f9dSMark Adams PetscFunctionReturn(0); 13758f7e8f9dSMark Adams } 13768f7e8f9dSMark Adams 1377930e68a5SMark Adams /*MC 137886a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 137986a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1380930e68a5SMark Adams 1381930e68a5SMark Adams Level: beginner 1382930e68a5SMark Adams 138386a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1384930e68a5SMark Adams M*/ 138586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1386930e68a5SMark Adams { 1387930e68a5SMark Adams PetscErrorCode ierr; 1388930e68a5SMark Adams PetscInt n = A->rmap->n; 1389930e68a5SMark Adams 1390930e68a5SMark Adams PetscFunctionBegin; 1391930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1392930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1393930e68a5SMark Adams (*B)->factortype = ftype; 13944ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1395930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1396930e68a5SMark Adams 13978f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1398930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 139986a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 140086a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 140186a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 140286a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 140386a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 140486a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 140586a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1406930e68a5SMark Adams 1407930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 140886a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1409930e68a5SMark Adams PetscFunctionReturn(0); 1410930e68a5SMark Adams } 14118f7e8f9dSMark Adams 14128f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 14138f7e8f9dSMark Adams { 14148f7e8f9dSMark Adams PetscErrorCode ierr; 14158f7e8f9dSMark Adams PetscInt n = A->rmap->n; 14168f7e8f9dSMark Adams 14178f7e8f9dSMark Adams PetscFunctionBegin; 14188f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 14198f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 14208f7e8f9dSMark Adams (*B)->factortype = ftype; 1421f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 14224ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 14238f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 14248f7e8f9dSMark Adams 14258f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 14268f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 14278f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 14288f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 14298f7e8f9dSMark Adams 14308f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 14318f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 14328f7e8f9dSMark Adams PetscFunctionReturn(0); 14338f7e8f9dSMark Adams } 143486a27549SJunchao Zhang 143586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 143686a27549SJunchao Zhang { 143786a27549SJunchao Zhang PetscErrorCode ierr; 143886a27549SJunchao Zhang 143986a27549SJunchao Zhang PetscFunctionBegin; 144086a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 144186a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 144286a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 144386a27549SJunchao Zhang PetscFunctionReturn(0); 144486a27549SJunchao Zhang } 144586a27549SJunchao Zhang 1446