1152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 28c3ff71bSJunchao Zhang #include <petscsystypes.h> 38c3ff71bSJunchao Zhang #include <petscerror.h> 4152b3e56SJunchao Zhang #include <petscvec_kokkos.hpp> 58c3ff71bSJunchao Zhang 68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 7f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 108c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 118c3ff71bSJunchao Zhang 128c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 13a587d139SMark #include <petscmat.h> 148c3ff71bSJunchao Zhang 158c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 168c3ff71bSJunchao Zhang 178c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 188c3ff71bSJunchao Zhang { 198c3ff71bSJunchao Zhang PetscErrorCode ierr; 20a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 218c3ff71bSJunchao Zhang 228c3ff71bSJunchao Zhang PetscFunctionBegin; 238c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 24a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 25a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 26a587d139SMark } 278c3ff71bSJunchao Zhang /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 288c3ff71bSJunchao Zhang PetscFunctionReturn(0); 298c3ff71bSJunchao Zhang } 308c3ff71bSJunchao Zhang 318c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 328c3ff71bSJunchao Zhang { 33152b3e56SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ*>(A->data); 348c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 358c3ff71bSJunchao Zhang PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 368c3ff71bSJunchao Zhang 378c3ff71bSJunchao Zhang PetscFunctionBegin; 38152b3e56SJunchao Zhang /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */ 398c3ff71bSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 408c3ff71bSJunchao Zhang delete aijkok; 418c3ff71bSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 428c3ff71bSJunchao Zhang aijkok->nonzerostate = A->nonzerostate; 438c3ff71bSJunchao Zhang A->spptr = aijkok; 448c3ff71bSJunchao Zhang } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 45152b3e56SJunchao Zhang // Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 46152b3e56SJunchao Zhang aijkok->a_dual.modify_host(); /* Mark host is modified */ 47152b3e56SJunchao Zhang aijkok->a_dual.sync_device(); /* Sync the device */ 48152b3e56SJunchao Zhang aijkok->need_sync_At_values = PETSC_TRUE; /* Mark matT and matH need to sync, but don't sync them now */ 49152b3e56SJunchao Zhang aijkok->need_sync_Ah_values = PETSC_TRUE; 508c3ff71bSJunchao Zhang } 518c3ff71bSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 55f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 56f0cf5187SStefano Zampini { 57f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58f0cf5187SStefano Zampini 59f0cf5187SStefano Zampini PetscFunctionBegin; 60f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 61f0cf5187SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 62f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 63f0cf5187SStefano Zampini Kokkos::deep_copy(aijkok->a_h,aijkok->a_d); 64f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 65f0cf5187SStefano Zampini } 66f0cf5187SStefano Zampini PetscFunctionReturn(0); 67f0cf5187SStefano Zampini } 68f0cf5187SStefano Zampini 69f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 70f0cf5187SStefano Zampini { 71f0cf5187SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 72f0cf5187SStefano Zampini PetscErrorCode ierr; 73f0cf5187SStefano Zampini 74f0cf5187SStefano Zampini PetscFunctionBegin; 75f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 76f0cf5187SStefano Zampini *array = a->a; 77f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 78f0cf5187SStefano Zampini PetscFunctionReturn(0); 79f0cf5187SStefano Zampini } 80f0cf5187SStefano Zampini 81a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 82a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 83a587d139SMark { 84a587d139SMark Mat_SeqAIJKokkos *aijkok; 85a587d139SMark Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 86a587d139SMark 87a587d139SMark PetscFunctionBegin; 88a587d139SMark // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 89a587d139SMark aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 90a587d139SMark if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 91152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 92a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 93a587d139SMark PetscFunctionReturn(0); 94a587d139SMark } 95a587d139SMark 96a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 97a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 98a587d139SMark { 99a587d139SMark Mat_SeqAIJKokkos *aijkok; 100a587d139SMark 101a587d139SMark PetscFunctionBegin; 102a587d139SMark aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 103a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 104a587d139SMark *d_mat = aijkok->device_mat_d.data(); 105a587d139SMark } else { 106a587d139SMark PetscErrorCode ierr; 107a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 108a587d139SMark *d_mat = NULL; 109a587d139SMark } 110a587d139SMark PetscFunctionReturn(0); 111a587d139SMark } 112152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A) 113152b3e56SJunchao Zhang { 114152b3e56SJunchao Zhang PetscErrorCode ierr; 115152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 116152b3e56SJunchao Zhang 117152b3e56SJunchao Zhang PetscFunctionBegin; 118152b3e56SJunchao Zhang if (!aijkok->At) { /* Generate At for the first time */ 119152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr); 120152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 121152b3e56SJunchao Zhang } else if (aijkok->need_sync_At_values) { /* Only update At values */ 122152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr); 123152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr); 124152b3e56SJunchao Zhang } 125152b3e56SJunchao Zhang aijkok->need_sync_At_values = PETSC_FALSE; 126152b3e56SJunchao Zhang PetscFunctionReturn(0); 127152b3e56SJunchao Zhang } 128152b3e56SJunchao Zhang 129152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeHermitian(Mat A) 130152b3e56SJunchao Zhang { 131152b3e56SJunchao Zhang PetscErrorCode ierr; 132152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 133152b3e56SJunchao Zhang 134152b3e56SJunchao Zhang PetscFunctionBegin; 135152b3e56SJunchao Zhang if (!aijkok->Ah) { /* Generate Ah for the first time */ 136152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 137152b3e56SJunchao Zhang ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 138152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 139152b3e56SJunchao Zhang } else if (aijkok->need_sync_Ah_values) { /* Only update Ah values */ 140152b3e56SJunchao Zhang ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr); 141152b3e56SJunchao Zhang ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr); 142152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr); 143152b3e56SJunchao Zhang } 144152b3e56SJunchao Zhang aijkok->need_sync_Ah_values = PETSC_FALSE; 145152b3e56SJunchao Zhang PetscFunctionReturn(0); 146152b3e56SJunchao Zhang } 147a587d139SMark 1488c3ff71bSJunchao Zhang /* y = A x */ 1498c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1508c3ff71bSJunchao Zhang { 1518c3ff71bSJunchao Zhang PetscErrorCode ierr; 1528c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 153152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 154152b3e56SJunchao Zhang PetscScalarKokkosView yv; 1558c3ff71bSJunchao Zhang 1568c3ff71bSJunchao Zhang PetscFunctionBegin; 1578c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 158152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 159152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 1608c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 161152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 162152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 163152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 164bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 165152b3e56SJunchao Zhang /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 166152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 1678c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1688c3ff71bSJunchao Zhang } 1698c3ff71bSJunchao Zhang 1708c3ff71bSJunchao Zhang /* y = A^T x */ 1718c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1728c3ff71bSJunchao Zhang { 1738c3ff71bSJunchao Zhang PetscErrorCode ierr; 1748c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 175152b3e56SJunchao Zhang Mat B; 176152b3e56SJunchao Zhang const char *mode; 177152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 178152b3e56SJunchao Zhang PetscScalarKokkosView yv; 1798c3ff71bSJunchao Zhang 1808c3ff71bSJunchao Zhang PetscFunctionBegin; 1818c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 182152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 183152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 184152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 185152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 186152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 187152b3e56SJunchao Zhang mode = "N"; 188152b3e56SJunchao Zhang } else { 189152b3e56SJunchao Zhang B = A; 190152b3e56SJunchao Zhang mode = "T"; 191152b3e56SJunchao Zhang } 192152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 193152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 194152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 195152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 196bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 197152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 1988c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1998c3ff71bSJunchao Zhang } 2008c3ff71bSJunchao Zhang 2018c3ff71bSJunchao Zhang /* y = A^H x */ 2028c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2038c3ff71bSJunchao Zhang { 2048c3ff71bSJunchao Zhang PetscErrorCode ierr; 2058c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 206152b3e56SJunchao Zhang Mat B; 207152b3e56SJunchao Zhang const char *mode; 208152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 209152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2108c3ff71bSJunchao Zhang 2118c3ff71bSJunchao Zhang PetscFunctionBegin; 2128c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 213152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 214152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 215152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 216152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTransposeHermitian(A);CHKERRQ(ierr); 217152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 218152b3e56SJunchao Zhang mode = "N"; 219152b3e56SJunchao Zhang } else { 220152b3e56SJunchao Zhang B = A; 221152b3e56SJunchao Zhang mode = "C"; 222152b3e56SJunchao Zhang } 223152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 224152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 225152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 226152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 227bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 228152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2298c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2308c3ff71bSJunchao Zhang } 2318c3ff71bSJunchao Zhang 2328c3ff71bSJunchao Zhang /* z = A x + y */ 2338c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 2348c3ff71bSJunchao Zhang { 2358c3ff71bSJunchao Zhang PetscErrorCode ierr; 2368c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 237152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 238152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2398c3ff71bSJunchao Zhang 2408c3ff71bSJunchao Zhang PetscFunctionBegin; 2418c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 242152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 243152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 244152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 2458c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 2468c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 247152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 248152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 249152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 250152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 251bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 252152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2538c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2548c3ff71bSJunchao Zhang } 2558c3ff71bSJunchao Zhang 2568c3ff71bSJunchao Zhang /* z = A^T x + y */ 2578c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 2588c3ff71bSJunchao Zhang { 2598c3ff71bSJunchao Zhang PetscErrorCode ierr; 2608c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 261152b3e56SJunchao Zhang Mat B; 262152b3e56SJunchao Zhang const char *mode; 263152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 264152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2658c3ff71bSJunchao Zhang 2668c3ff71bSJunchao Zhang PetscFunctionBegin; 2678c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 268152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 269152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 270152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 2718c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 272152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 273152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr); 274152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At; 275152b3e56SJunchao Zhang mode = "N"; 276152b3e56SJunchao Zhang } else { 277152b3e56SJunchao Zhang B = A; 278152b3e56SJunchao Zhang mode = "T"; 279152b3e56SJunchao Zhang } 280152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 281152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 282152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 283152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 284152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 285bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 286152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2878c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2888c3ff71bSJunchao Zhang } 2898c3ff71bSJunchao Zhang 2908c3ff71bSJunchao Zhang /* z = A^H x + y */ 2918c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 2928c3ff71bSJunchao Zhang { 2938c3ff71bSJunchao Zhang PetscErrorCode ierr; 2948c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 295152b3e56SJunchao Zhang Mat B; 296152b3e56SJunchao Zhang const char *mode; 297152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 298152b3e56SJunchao Zhang PetscScalarKokkosView zv; 2998c3ff71bSJunchao Zhang 3008c3ff71bSJunchao Zhang PetscFunctionBegin; 3018c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 302152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 303152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 304152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3058c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 306152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 307152b3e56SJunchao Zhang ierr = MatSeqAIJKokkosGenerateTransposeHermitian(A);CHKERRQ(ierr); 308152b3e56SJunchao Zhang B = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah; 309152b3e56SJunchao Zhang mode = "N"; 310152b3e56SJunchao Zhang } else { 311152b3e56SJunchao Zhang B = A; 312152b3e56SJunchao Zhang mode = "C"; 313152b3e56SJunchao Zhang } 314152b3e56SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 315152b3e56SJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 316152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 317152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 318152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 319bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 320152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 321152b3e56SJunchao Zhang PetscFunctionReturn(0); 322152b3e56SJunchao Zhang } 323152b3e56SJunchao Zhang 324152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 325152b3e56SJunchao Zhang { 326152b3e56SJunchao Zhang PetscErrorCode ierr; 327152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 328152b3e56SJunchao Zhang 329152b3e56SJunchao Zhang PetscFunctionBegin; 330152b3e56SJunchao Zhang switch (op) { 331152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 332152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 333152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 334152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 335152b3e56SJunchao Zhang break; 336152b3e56SJunchao Zhang default: 337152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 338152b3e56SJunchao Zhang break; 339152b3e56SJunchao Zhang } 3408c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3418c3ff71bSJunchao Zhang } 3428c3ff71bSJunchao Zhang 3433d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3448c3ff71bSJunchao Zhang { 3458c3ff71bSJunchao Zhang PetscErrorCode ierr; 3468c3ff71bSJunchao Zhang Mat B; 347c58ef05eSStefano Zampini Mat_SeqAIJ *aij; 3488c3ff71bSJunchao Zhang 3498c3ff71bSJunchao Zhang PetscFunctionBegin; 350a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 3518c3ff71bSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 3528c3ff71bSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3538c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 3548c3ff71bSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3558c3ff71bSJunchao Zhang } 3568c3ff71bSJunchao Zhang 3578c3ff71bSJunchao Zhang B = *newmat; 3588c3ff71bSJunchao Zhang ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3598c3ff71bSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 3608c3ff71bSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 3610d8bce8aSStefano Zampini ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 362f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 363c58ef05eSStefano Zampini /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 364c58ef05eSStefano Zampini aij = (Mat_SeqAIJ*)B->data; 365c58ef05eSStefano Zampini aij->inode.use = PETSC_FALSE; 3668c3ff71bSJunchao Zhang 3678c3ff71bSJunchao Zhang B->offloadmask = PETSC_OFFLOAD_CPU; 3688c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3698c3ff71bSJunchao Zhang } 3708c3ff71bSJunchao Zhang 3718c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 3728c3ff71bSJunchao Zhang { 3738c3ff71bSJunchao Zhang PetscErrorCode ierr; 3748c3ff71bSJunchao Zhang 3758c3ff71bSJunchao Zhang PetscFunctionBegin; 3768c3ff71bSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3778c3ff71bSJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3788c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3798c3ff71bSJunchao Zhang } 3808c3ff71bSJunchao Zhang 3818c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 3828c3ff71bSJunchao Zhang { 3838c3ff71bSJunchao Zhang PetscErrorCode ierr; 3848c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 3858c3ff71bSJunchao Zhang 3868c3ff71bSJunchao Zhang PetscFunctionBegin; 3878f7e8f9dSMark Adams if (aijkok) { 3888f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 389a587d139SMark delete aijkok->colmap_d; 390a587d139SMark delete aijkok->i_uncompressed_d; 391a587d139SMark } 3928f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 3938f7e8f9dSMark Adams } 3948c3ff71bSJunchao Zhang delete aijkok; 395f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 396152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 397152b3e56SJunchao Zhang A->spptr = NULL; 3988c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3998c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4008c3ff71bSJunchao Zhang } 4018c3ff71bSJunchao Zhang 402a3f881fbSStefano Zampini #if 0 403a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 404a3f881fbSStefano Zampini { 405a3f881fbSStefano Zampini MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 406a3f881fbSStefano Zampini 407a3f881fbSStefano Zampini PetscFunctionBegin; 408a3f881fbSStefano Zampini delete kh; 409a3f881fbSStefano Zampini PetscFunctionReturn(0); 410a3f881fbSStefano Zampini } 411a3f881fbSStefano Zampini 412a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 413a3f881fbSStefano Zampini { 414a3f881fbSStefano Zampini Mat_Product *product = C->product; 415a3f881fbSStefano Zampini Mat A,B; 416a3f881fbSStefano Zampini MatProductType ptype; 417a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 418a3f881fbSStefano Zampini bool tA,tB; 419a3f881fbSStefano Zampini PetscErrorCode ierr; 420a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 421a3f881fbSStefano Zampini Mat_SeqAIJ *c; 422a3f881fbSStefano Zampini 423a3f881fbSStefano Zampini PetscFunctionBegin; 424a3f881fbSStefano Zampini MatCheckProduct(C,1); 425a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 426a3f881fbSStefano Zampini A = product->A; 427a3f881fbSStefano Zampini B = product->B; 428a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 429a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 430a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 431a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 432a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 433a3f881fbSStefano Zampini kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 434a3f881fbSStefano Zampini ptype = product->type; 435a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 436a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 437a3f881fbSStefano Zampini switch (ptype) { 438a3f881fbSStefano Zampini case MATPRODUCT_AB: 439a3f881fbSStefano Zampini tA = false; 440a3f881fbSStefano Zampini tB = false; 441a3f881fbSStefano Zampini break; 442a3f881fbSStefano Zampini case MATPRODUCT_AtB: 443a3f881fbSStefano Zampini tA = true; 444a3f881fbSStefano Zampini tB = false; 445a3f881fbSStefano Zampini break; 446a3f881fbSStefano Zampini case MATPRODUCT_ABt: 447a3f881fbSStefano Zampini tA = false; 448a3f881fbSStefano Zampini tB = true; 449a3f881fbSStefano Zampini break; 450a3f881fbSStefano Zampini default: 451a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 452a3f881fbSStefano Zampini } 453a3f881fbSStefano Zampini 454a3f881fbSStefano Zampini KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 455a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 456a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 457a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 458a3f881fbSStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 459a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 460a3f881fbSStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 461a3f881fbSStefano Zampini c->reallocs = 0; 462a3f881fbSStefano Zampini C->info.mallocs += 0; 463a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 464a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 465a3f881fbSStefano Zampini C->num_ass++; 466a3f881fbSStefano Zampini /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 467a3f881fbSStefano Zampini // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 468a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_BOTH; 469a3f881fbSStefano Zampini // Also, we should add support to copy back from device to host 470a3f881fbSStefano Zampini PetscFunctionReturn(0); 471a3f881fbSStefano Zampini } 472a3f881fbSStefano Zampini 473a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 474a3f881fbSStefano Zampini { 475a3f881fbSStefano Zampini Mat_Product *product = C->product; 476a3f881fbSStefano Zampini Mat A,B; 477a3f881fbSStefano Zampini MatProductType ptype; 478a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 479a3f881fbSStefano Zampini PetscInt m,n,k; 480a3f881fbSStefano Zampini bool tA,tB; 481a3f881fbSStefano Zampini PetscErrorCode ierr; 482a3f881fbSStefano Zampini Mat_SeqAIJ *c; 483a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 484a3f881fbSStefano Zampini 485a3f881fbSStefano Zampini PetscFunctionBegin; 486a3f881fbSStefano Zampini MatCheckProduct(C,1); 487a3f881fbSStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 488a3f881fbSStefano Zampini A = product->A; 489a3f881fbSStefano Zampini B = product->B; 490a3f881fbSStefano Zampini // TODO only copy the i,j data, not the values 491a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 492a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 493a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 494a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 495a3f881fbSStefano Zampini ptype = product->type; 496a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 497a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 498a3f881fbSStefano Zampini switch (ptype) { 499a3f881fbSStefano Zampini case MATPRODUCT_AB: 500a3f881fbSStefano Zampini tA = false; 501a3f881fbSStefano Zampini tB = false; 502a3f881fbSStefano Zampini m = A->rmap->n; 503a3f881fbSStefano Zampini n = B->cmap->n; 504a3f881fbSStefano Zampini break; 505a3f881fbSStefano Zampini case MATPRODUCT_AtB: 506a3f881fbSStefano Zampini tA = true; 507a3f881fbSStefano Zampini tB = false; 508a3f881fbSStefano Zampini m = A->cmap->n; 509a3f881fbSStefano Zampini n = B->cmap->n; 510a3f881fbSStefano Zampini break; 511a3f881fbSStefano Zampini case MATPRODUCT_ABt: 512a3f881fbSStefano Zampini tA = false; 513a3f881fbSStefano Zampini tB = true; 514a3f881fbSStefano Zampini m = A->rmap->n; 515a3f881fbSStefano Zampini n = B->rmap->n; 516a3f881fbSStefano Zampini break; 517a3f881fbSStefano Zampini default: 518a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 519a3f881fbSStefano Zampini } 520a3f881fbSStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 521a3f881fbSStefano Zampini ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 522a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 523a3f881fbSStefano Zampini 524a3f881fbSStefano Zampini kh = new MatMatKernelHandle_t; 525a3f881fbSStefano Zampini // TODO SZ: ADD RUNTIME SELECTION OF THESE 526a3f881fbSStefano Zampini kh->set_team_work_size(16); 527a3f881fbSStefano Zampini kh->set_dynamic_scheduling(true); 528a3f881fbSStefano Zampini // Select an spgemm algorithm, limited by configuration at compile-time and 529a3f881fbSStefano Zampini // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 530a3f881fbSStefano Zampini // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 531a3f881fbSStefano Zampini std::string myalg("SPGEMM_KK_MEMORY"); 532a3f881fbSStefano Zampini kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 533a3f881fbSStefano Zampini 534a3f881fbSStefano Zampini ///////////////////////////////////// 535a3f881fbSStefano Zampini // TODO JZ 536a3f881fbSStefano Zampini ckok = NULL; //new Mat_SeqAIJKokkos(); 537a3f881fbSStefano Zampini C->spptr = ckok; 538a3f881fbSStefano Zampini KokkosCsrMatrix_t ccsr; // here only to have the code compile 539a3f881fbSStefano Zampini KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 540a3f881fbSStefano Zampini //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 541a3f881fbSStefano Zampini //c->nz = get_nnz_from_ccsr 542a3f881fbSStefano Zampini ////////////////////////////////////// 543a3f881fbSStefano Zampini c->singlemalloc = PETSC_FALSE; 544a3f881fbSStefano Zampini c->free_a = PETSC_TRUE; 545a3f881fbSStefano Zampini c->free_ij = PETSC_TRUE; 546a3f881fbSStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 547a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 548a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 549a3f881fbSStefano Zampini //////////////////////////////////// 550a3f881fbSStefano Zampini // TODO JZ copy from device to c->i and c->j 551a3f881fbSStefano Zampini //////////////////////////////////// 552a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 553a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 554a3f881fbSStefano Zampini c->maxnz = c->nz; 555a3f881fbSStefano Zampini c->nonzerorowcnt = 0; 556a3f881fbSStefano Zampini c->rmax = 0; 557a3f881fbSStefano Zampini for (k = 0; k < m; k++) { 558a3f881fbSStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 559a3f881fbSStefano Zampini c->ilen[k] = c->imax[k] = nn; 560a3f881fbSStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 561a3f881fbSStefano Zampini c->rmax = PetscMax(c->rmax,nn); 562a3f881fbSStefano Zampini } 563a3f881fbSStefano Zampini 564a3f881fbSStefano Zampini C->nonzerostate++; 565a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 566a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 567a3f881fbSStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 568a3f881fbSStefano Zampini ckok->nonzerostate = C->nonzerostate; 569a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 570a3f881fbSStefano Zampini C->preallocated = PETSC_TRUE; 571a3f881fbSStefano Zampini C->assembled = PETSC_FALSE; 572a3f881fbSStefano Zampini C->was_assembled = PETSC_FALSE; 573a3f881fbSStefano Zampini 574a3f881fbSStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 575a3f881fbSStefano Zampini C->product->data = kh; 576a3f881fbSStefano Zampini C->product->destroy = MatMatKernelHandleDestroy_Private; 577a3f881fbSStefano Zampini PetscFunctionReturn(0); 578a3f881fbSStefano Zampini } 579a3f881fbSStefano Zampini 580a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 581a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 582a3f881fbSStefano Zampini { 583a3f881fbSStefano Zampini Mat_Product *product = mat->product; 584a3f881fbSStefano Zampini PetscErrorCode ierr; 585a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 586a3f881fbSStefano Zampini 587a3f881fbSStefano Zampini PetscFunctionBegin; 588a3f881fbSStefano Zampini MatCheckProduct(mat,1); 589a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 590a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 591a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 592a3f881fbSStefano Zampini } 593a3f881fbSStefano Zampini if (Biskok && Ciskok) { 594a3f881fbSStefano Zampini switch (product->type) { 595a3f881fbSStefano Zampini case MATPRODUCT_AB: 596a3f881fbSStefano Zampini case MATPRODUCT_AtB: 597a3f881fbSStefano Zampini case MATPRODUCT_ABt: 598a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 599a3f881fbSStefano Zampini break; 600a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 601a3f881fbSStefano Zampini case MATPRODUCT_RARt: 602a3f881fbSStefano Zampini case MATPRODUCT_ABC: 603a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 604a3f881fbSStefano Zampini break; 605a3f881fbSStefano Zampini default: 606a3f881fbSStefano Zampini break; 607a3f881fbSStefano Zampini } 608a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 609a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 610a3f881fbSStefano Zampini } 611a3f881fbSStefano Zampini PetscFunctionReturn(0); 612a3f881fbSStefano Zampini } 613a3f881fbSStefano Zampini #endif 614a3f881fbSStefano Zampini 6158c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 6168c3ff71bSJunchao Zhang { 6178c3ff71bSJunchao Zhang PetscErrorCode ierr; 6188c3ff71bSJunchao Zhang 6198c3ff71bSJunchao Zhang PetscFunctionBegin; 6208c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 6218c3ff71bSJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 6228c3ff71bSJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 6238c3ff71bSJunchao Zhang PetscFunctionReturn(0); 6248c3ff71bSJunchao Zhang } 6258c3ff71bSJunchao Zhang 626a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 627a587d139SMark { 628a587d139SMark PetscErrorCode ierr; 629a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 630a587d139SMark PetscFunctionBegin; 631a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 632a587d139SMark SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 633a587d139SMark } 634a587d139SMark ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 635a587d139SMark PetscFunctionReturn(0); 636a587d139SMark } 637a587d139SMark 638f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 639f0cf5187SStefano Zampini { 640f0cf5187SStefano Zampini PetscErrorCode ierr; 641f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 642f0cf5187SStefano Zampini 643f0cf5187SStefano Zampini PetscFunctionBegin; 644f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 645f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 646f0cf5187SStefano Zampini KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 647f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 648f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 649f0cf5187SStefano Zampini ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 650f0cf5187SStefano Zampini // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 651f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 652f0cf5187SStefano Zampini PetscFunctionReturn(0); 653f0cf5187SStefano Zampini } 654f0cf5187SStefano Zampini 655a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 656a587d139SMark { 657a587d139SMark PetscErrorCode ierr; 658a587d139SMark PetscBool both = PETSC_FALSE; 659a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 660a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 661a587d139SMark 662a587d139SMark PetscFunctionBegin; 663a587d139SMark if (aijkok && aijkok->a_d.data()) { 664f0cf5187SStefano Zampini KokkosBlas::fill(aijkok->a_d,0.0); 665a587d139SMark both = PETSC_TRUE; 666a587d139SMark } 667a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 668a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 669a587d139SMark if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 670a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 671a587d139SMark PetscFunctionReturn(0); 672a587d139SMark } 673a587d139SMark 674f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 675a587d139SMark { 676a587d139SMark PetscErrorCode ierr; 677a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 678a587d139SMark 679a587d139SMark PetscFunctionBegin; 680f0cf5187SStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 681a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 682a587d139SMark PetscFunctionReturn(0); 683a587d139SMark } 684f0cf5187SStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 685a587d139SMark PetscBool e; 686a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 687a587d139SMark if (e) { 688a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 689f0cf5187SStefano Zampini if (e) str = SAME_NONZERO_PATTERN; 690a587d139SMark } 691a587d139SMark } 692a587d139SMark if (str != SAME_NONZERO_PATTERN) { 693a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 694a587d139SMark PetscFunctionReturn(0); 695a587d139SMark } else { 696a587d139SMark if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 697a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 698a587d139SMark PetscFunctionReturn(0); 699f0cf5187SStefano Zampini } else { 700a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 701f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 702a587d139SMark } 703a587d139SMark Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 704a587d139SMark Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 705a587d139SMark if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 706f0cf5187SStefano Zampini KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 707f0cf5187SStefano Zampini Y->offloadmask = PETSC_OFFLOAD_GPU; 708f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 709a587d139SMark ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 710f0cf5187SStefano Zampini // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 711f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 712a587d139SMark } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 713a587d139SMark } 714a587d139SMark PetscFunctionReturn(0); 715a587d139SMark } 716a587d139SMark 7178f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info) 7188f7e8f9dSMark Adams { 7198f7e8f9dSMark Adams PetscErrorCode ierr; 7208f7e8f9dSMark Adams 7218f7e8f9dSMark Adams PetscFunctionBegin; 7228f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 7238f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 7248f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 7258f7e8f9dSMark Adams PetscFunctionReturn(0); 7268f7e8f9dSMark Adams } 7278f7e8f9dSMark Adams 7288c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 7298c3ff71bSJunchao Zhang { 7308c3ff71bSJunchao Zhang PetscFunctionBegin; 7316f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 7326f3d89d0SStefano Zampini 733a587d139SMark A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 7348c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 7358c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 7368c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 737a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 738f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 739a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 740a3f881fbSStefano Zampini //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 7418c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 7428c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 7438c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 7448c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 7458c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 7468c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 747152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 7488c3ff71bSJunchao Zhang PetscFunctionReturn(0); 7498c3ff71bSJunchao Zhang } 7508c3ff71bSJunchao Zhang 7518c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 752152b3e56SJunchao Zhang /*@C 7538c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 7548c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 7558c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 7568c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 7578c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 7588c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 7598c3ff71bSJunchao Zhang 7608c3ff71bSJunchao Zhang Collective 7618c3ff71bSJunchao Zhang 7628c3ff71bSJunchao Zhang Input Parameters: 7638c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 7648c3ff71bSJunchao Zhang . m - number of rows 7658c3ff71bSJunchao Zhang . n - number of columns 7668c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 7678c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 7688c3ff71bSJunchao Zhang (possibly different for each row) or NULL 7698c3ff71bSJunchao Zhang 7708c3ff71bSJunchao Zhang Output Parameter: 7718c3ff71bSJunchao Zhang . A - the matrix 7728c3ff71bSJunchao Zhang 7738c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 7748c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 7758c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 7768c3ff71bSJunchao Zhang 7778c3ff71bSJunchao Zhang Notes: 7788c3ff71bSJunchao Zhang If nnz is given then nz is ignored 7798c3ff71bSJunchao Zhang 7808c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 7818c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 7828c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 7838c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 7848c3ff71bSJunchao Zhang 7858c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 7868c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 7878c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 7888c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 7898c3ff71bSJunchao Zhang 7908c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 7918c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 7928c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 7938c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 7948c3ff71bSJunchao Zhang 7958c3ff71bSJunchao Zhang Level: intermediate 7968c3ff71bSJunchao Zhang 7978c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 7988c3ff71bSJunchao Zhang @*/ 7998c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 8008c3ff71bSJunchao Zhang { 8018c3ff71bSJunchao Zhang PetscErrorCode ierr; 8028c3ff71bSJunchao Zhang 8038c3ff71bSJunchao Zhang PetscFunctionBegin; 8048c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 8058c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 8068c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 8078c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 8088c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 8098c3ff71bSJunchao Zhang PetscFunctionReturn(0); 8108c3ff71bSJunchao Zhang } 811930e68a5SMark Adams 812930e68a5SMark Adams // factorizations 8138f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 8148f7e8f9dSMark Adams // 8158f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 8168f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 8178f7e8f9dSMark Adams // 8188f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 819930e68a5SMark Adams { 8208f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 8218f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 8228f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 823930e68a5SMark Adams PetscErrorCode ierr; 8248f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 8258f7e8f9dSMark Adams const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 8268f7e8f9dSMark Adams const PetscScalar *aa_d = aijkok->a_d.data(); 8278f7e8f9dSMark Adams PetscScalar *ba_d = baijkok->a_d.data(); 8288f7e8f9dSMark Adams PetscBool row_identity,col_identity; 8298f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 8308f7e8f9dSMark Adams PetscContainer container; 831930e68a5SMark Adams 832930e68a5SMark Adams PetscFunctionBegin; 8338f7e8f9dSMark Adams if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 8348f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 8358f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 8368f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 8378f7e8f9dSMark Adams if (container) { 8388f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 8398f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 8408f7e8f9dSMark Adams Nf = (*pNf)%1000; 8418f7e8f9dSMark Adams nv = (*pNf)/1000; 8428f7e8f9dSMark Adams if (nv>0) nVec = nv; 8438f7e8f9dSMark Adams } else Nf = 1; 8448f7e8f9dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 8458f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 8468f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 8478f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 8488f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 8498f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 8508f7e8f9dSMark Adams #endif 8518f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 8528f7e8f9dSMark Adams { 8538f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 8548f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 8558f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 8568f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 8578f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 8588f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 8598f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 8608f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 8618f7e8f9dSMark Adams size_t flops_h = 0.0; 8628f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 8638f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 8648f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 8658f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 8668f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 8678f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 8688f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 8698f7e8f9dSMark Adams // Fill A --> fact 8708f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 8718f7e8f9dSMark Adams const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda 8728f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 8738f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 8748f7e8f9dSMark Adams // zero rows of B 8758f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 8768f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 8778f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 8788f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 8798f7e8f9dSMark Adams /* zero (unfactored row) */ 8808f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 8818f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 8828f7e8f9dSMark Adams }); 8838f7e8f9dSMark Adams // copy A into B 8848f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 8858f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 8868f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 8878f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 8888f7e8f9dSMark Adams /* load in initial (unfactored row) */ 8898f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 8908f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 8918f7e8f9dSMark Adams PetscScalar vala = av[j]; 8928f7e8f9dSMark Adams if (colb == rowb) { 8938f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 8948f7e8f9dSMark Adams } else { 8958f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 8968f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 8978f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 8988f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 8998f7e8f9dSMark Adams if (pbj[j] == colb) { 9008f7e8f9dSMark Adams pba[j] = vala; 9018f7e8f9dSMark Adams set++; 9028f7e8f9dSMark Adams break; 9038f7e8f9dSMark Adams } 9048f7e8f9dSMark Adams } 9058f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 9068f7e8f9dSMark Adams } 9078f7e8f9dSMark Adams } 9088f7e8f9dSMark Adams }); 9098f7e8f9dSMark Adams }); 9108f7e8f9dSMark Adams Kokkos::fence(); 911930e68a5SMark Adams 9128f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 9138f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 9148f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 9158f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 9168f7e8f9dSMark Adams const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda 9178f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 9188f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 9198f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 9208f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 9218f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 9228f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 9238f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 9248f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 9258f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 9268f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 9278f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 9288f7e8f9dSMark Adams else { 9298f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 9308f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 9318f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 9328f7e8f9dSMark Adams size_t st_idx; 9338f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 9348f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 9358f7e8f9dSMark Adams // get column, there has got to be a better way 9368f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 9378f7e8f9dSMark Adams //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 9388f7e8f9dSMark Adams if (pjL[j] == ii) { 9398f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 9408f7e8f9dSMark Adams idx = j; // output 9418f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 9428f7e8f9dSMark Adams } 9438f7e8f9dSMark Adams }, st_idx); 9448f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 9458f7e8f9dSMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 9468f7e8f9dSMark Adams else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 9478f7e8f9dSMark Adams // U(i+1,:end) 9488f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 9498f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 9508f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 9518f7e8f9dSMark Adams if (col==myk) { 9528f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 9538f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 9548f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 9558f7e8f9dSMark Adams } else { 9568f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 9578f7e8f9dSMark Adams PetscInt high, low; 9588f7e8f9dSMark Adams const PetscInt *startj; 9598f7e8f9dSMark Adams if (col<myk) { // L 9608f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 9618f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 9628f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 9638f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 9648f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 9658f7e8f9dSMark Adams } else { 9668f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 9678f7e8f9dSMark Adams start = ba_d + idx; 9688f7e8f9dSMark Adams startj= bj_d + idx; 9698f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 9708f7e8f9dSMark Adams } 9718f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 9728f7e8f9dSMark Adams low = 0; 9738f7e8f9dSMark Adams high = (PetscInt)(end-start); 9748f7e8f9dSMark Adams while (high-low > 5) { 9758f7e8f9dSMark Adams int t = (low+high)/2; 9768f7e8f9dSMark Adams if (startj[t] > col) high = t; 9778f7e8f9dSMark Adams else low = t; 9788f7e8f9dSMark Adams } 9798f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 9808f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 9818f7e8f9dSMark Adams } 9828f7e8f9dSMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 9838f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 9848f7e8f9dSMark Adams } 9858f7e8f9dSMark Adams }); 9868f7e8f9dSMark Adams } 9878f7e8f9dSMark Adams } 9888f7e8f9dSMark Adams }); 9898f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 9908f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 9918f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 9928f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 9938f7e8f9dSMark Adams }); 9948f7e8f9dSMark Adams Kokkos::fence(); 9958f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 9968f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 9978f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 9988f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 9998f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 10008f7e8f9dSMark Adams #endif 10018f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 10028f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 10038f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 10048f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 10058f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 10068f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 10078f7e8f9dSMark Adams if (i < end) { 10088f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 10098f7e8f9dSMark Adams *pv = 1.0/(*pv); 10108f7e8f9dSMark Adams } 10118f7e8f9dSMark Adams }); 10128f7e8f9dSMark Adams }); 10138f7e8f9dSMark Adams } 10148f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 10158f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 10168f7e8f9dSMark Adams #endif 10178f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 10188f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 10198f7e8f9dSMark Adams 10208f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 10218f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 10228f7e8f9dSMark Adams if (b->inode.size) { 10238f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 10248f7e8f9dSMark Adams } else if (row_identity && col_identity) { 10258f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 10268f7e8f9dSMark Adams } else { 10278f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 10288f7e8f9dSMark Adams } 10298f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 10308f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 10318f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 10328f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 10338f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 10348f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 10358f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 10368f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 10378f7e8f9dSMark Adams 1038930e68a5SMark Adams PetscFunctionReturn(0); 1039930e68a5SMark Adams } 1040930e68a5SMark Adams 1041930e68a5SMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1042930e68a5SMark Adams { 1043930e68a5SMark Adams PetscErrorCode ierr; 1044930e68a5SMark Adams 1045930e68a5SMark Adams PetscFunctionBegin; 1046930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1047930e68a5SMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS; 1048930e68a5SMark Adams PetscFunctionReturn(0); 1049930e68a5SMark Adams } 1050930e68a5SMark Adams 10518f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 10528f7e8f9dSMark Adams { 10538f7e8f9dSMark Adams PetscErrorCode ierr; 10548f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 10558f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1056930e68a5SMark Adams 10578f7e8f9dSMark Adams PetscFunctionBegin; 10588f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 10598f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 10608f7e8f9dSMark Adams // move B data into Kokkos 10618f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 10628f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 10638f7e8f9dSMark Adams { 10648f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 10658f7e8f9dSMark Adams if (!baijkok->diag_d) { 10668f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1067152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 10688f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 10698f7e8f9dSMark Adams } 10708f7e8f9dSMark Adams } 10718f7e8f9dSMark Adams PetscFunctionReturn(0); 10728f7e8f9dSMark Adams } 10738f7e8f9dSMark Adams 10748f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*); 10758f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat,MatFactorType,Mat*); 1076930e68a5SMark Adams 1077930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1078930e68a5SMark Adams { 1079930e68a5SMark Adams PetscErrorCode ierr; 1080930e68a5SMark Adams 1081930e68a5SMark Adams PetscFunctionBegin; 1082930e68a5SMark Adams ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr); 10838f7e8f9dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1084930e68a5SMark Adams PetscFunctionReturn(0); 1085930e68a5SMark Adams } 1086930e68a5SMark Adams 1087930e68a5SMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type) 1088930e68a5SMark Adams { 1089930e68a5SMark Adams PetscFunctionBegin; 1090930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1091930e68a5SMark Adams PetscFunctionReturn(0); 1092930e68a5SMark Adams } 1093930e68a5SMark Adams 10948f7e8f9dSMark Adams // use -pc_factor_mat_solver_type kokkosdevice 10958f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 10968f7e8f9dSMark Adams { 10978f7e8f9dSMark Adams PetscFunctionBegin; 10988f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 10998f7e8f9dSMark Adams PetscFunctionReturn(0); 11008f7e8f9dSMark Adams } 11018f7e8f9dSMark Adams 1102930e68a5SMark Adams /*MC 1103930e68a5SMark Adams MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1104930e68a5SMark Adams on a single GPU of type, seqaijkokkos, aijkokkos. 1105930e68a5SMark Adams 1106930e68a5SMark Adams Level: beginner 1107930e68a5SMark Adams 1108930e68a5SMark Adams .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation 1109930e68a5SMark Adams M*/ 1110930e68a5SMark Adams 1111930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B) 1112930e68a5SMark Adams { 1113930e68a5SMark Adams PetscErrorCode ierr; 1114930e68a5SMark Adams PetscInt n = A->rmap->n; 1115930e68a5SMark Adams 1116930e68a5SMark Adams PetscFunctionBegin; 1117930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1118930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1119930e68a5SMark Adams (*B)->factortype = ftype; 1120*f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 1121930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1122930e68a5SMark Adams 11238f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1124930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1125930e68a5SMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOS; 1126930e68a5SMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1127930e68a5SMark Adams 1128930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1129930e68a5SMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr); 1130930e68a5SMark Adams PetscFunctionReturn(0); 1131930e68a5SMark Adams } 11328f7e8f9dSMark Adams 11338f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 11348f7e8f9dSMark Adams { 11358f7e8f9dSMark Adams PetscErrorCode ierr; 11368f7e8f9dSMark Adams PetscInt n = A->rmap->n; 11378f7e8f9dSMark Adams 11388f7e8f9dSMark Adams PetscFunctionBegin; 11398f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 11408f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 11418f7e8f9dSMark Adams (*B)->factortype = ftype; 1142*f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 11438f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 11448f7e8f9dSMark Adams 11458f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 11468f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 11478f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 11488f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 11498f7e8f9dSMark Adams 11508f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 11518f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 11528f7e8f9dSMark Adams PetscFunctionReturn(0); 11538f7e8f9dSMark Adams } 1154