18c3ff71bSJunchao Zhang #include "petsc/private/petscimpl.h" 28c3ff71bSJunchao Zhang #include <petscsystypes.h> 38c3ff71bSJunchao Zhang #include <petscerror.h> 49f114eb2SJunchao Zhang #include <petscveckokkos.hpp> 58c3ff71bSJunchao Zhang 68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 7f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 118c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 128c3ff71bSJunchao Zhang 138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 14a587d139SMark #include <petscmat.h> 158c3ff71bSJunchao Zhang 168c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 178c3ff71bSJunchao Zhang 188c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 198c3ff71bSJunchao Zhang { 208c3ff71bSJunchao Zhang PetscErrorCode ierr; 21a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 228c3ff71bSJunchao Zhang 238c3ff71bSJunchao Zhang PetscFunctionBegin; 248c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 25a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 26a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 27a587d139SMark } 288c3ff71bSJunchao Zhang /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */ 298c3ff71bSJunchao Zhang PetscFunctionReturn(0); 308c3ff71bSJunchao Zhang } 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 338c3ff71bSJunchao Zhang { 348c3ff71bSJunchao Zhang Mat_SeqAIJ *aijseq = (Mat_SeqAIJ*)A->data; 358c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 368c3ff71bSJunchao Zhang PetscInt nrows = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz; 378c3ff71bSJunchao Zhang 388c3ff71bSJunchao Zhang PetscFunctionBegin; 39a3f881fbSStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 408c3ff71bSJunchao Zhang /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */ 418c3ff71bSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { 428c3ff71bSJunchao Zhang delete aijkok; 438c3ff71bSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a); 448c3ff71bSJunchao Zhang aijkok->nonzerostate = A->nonzerostate; 458c3ff71bSJunchao Zhang A->spptr = aijkok; 468c3ff71bSJunchao Zhang } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */ 478c3ff71bSJunchao Zhang Kokkos::deep_copy(aijkok->a_d,aijkok->a_h); 488c3ff71bSJunchao Zhang } 498c3ff71bSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 508c3ff71bSJunchao Zhang PetscFunctionReturn(0); 518c3ff71bSJunchao Zhang } 528c3ff71bSJunchao Zhang 53f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 54f0cf5187SStefano Zampini { 55f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 56f0cf5187SStefano Zampini 57f0cf5187SStefano Zampini PetscFunctionBegin; 58f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 59f0cf5187SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 60f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 61f0cf5187SStefano Zampini Kokkos::deep_copy(aijkok->a_h,aijkok->a_d); 62f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 63f0cf5187SStefano Zampini } 64f0cf5187SStefano Zampini PetscFunctionReturn(0); 65f0cf5187SStefano Zampini } 66f0cf5187SStefano Zampini 67f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 68f0cf5187SStefano Zampini { 69f0cf5187SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 70f0cf5187SStefano Zampini PetscErrorCode ierr; 71f0cf5187SStefano Zampini 72f0cf5187SStefano Zampini PetscFunctionBegin; 73f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 74f0cf5187SStefano Zampini *array = a->a; 75f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 76f0cf5187SStefano Zampini PetscFunctionReturn(0); 77f0cf5187SStefano Zampini } 78f0cf5187SStefano Zampini 79a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 80a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat) 81a587d139SMark { 82a587d139SMark Mat_SeqAIJKokkos *aijkok; 83a587d139SMark Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat); 84a587d139SMark 85a587d139SMark PetscFunctionBegin; 86a587d139SMark // ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 87a587d139SMark aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 88a587d139SMark if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos"); 89a587d139SMark aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k); 90a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 91a587d139SMark PetscFunctionReturn(0); 92a587d139SMark } 93a587d139SMark 94a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 95a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat) 96a587d139SMark { 97a587d139SMark Mat_SeqAIJKokkos *aijkok; 98a587d139SMark 99a587d139SMark PetscFunctionBegin; 100a587d139SMark aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 101a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 102a587d139SMark *d_mat = aijkok->device_mat_d.data(); 103a587d139SMark } else { 104a587d139SMark PetscErrorCode ierr; 105a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 106a587d139SMark *d_mat = NULL; 107a587d139SMark } 108a587d139SMark PetscFunctionReturn(0); 109a587d139SMark } 110a587d139SMark 1118c3ff71bSJunchao Zhang /* y = A x */ 1128c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1138c3ff71bSJunchao Zhang { 1148c3ff71bSJunchao Zhang PetscErrorCode ierr; 1158c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1168c3ff71bSJunchao Zhang ConstPetscScalarViewDevice_t xv; 1178c3ff71bSJunchao Zhang PetscScalarViewDevice_t yv; 1188c3ff71bSJunchao Zhang 1198c3ff71bSJunchao Zhang PetscFunctionBegin; 1208c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1218c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 1228c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 1238c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1248c3ff71bSJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 1258c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 1268c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 1271e566348SJunchao Zhang /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */ 128bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 1291e566348SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 1308c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1318c3ff71bSJunchao Zhang } 1328c3ff71bSJunchao Zhang 1338c3ff71bSJunchao Zhang /* y = A^T x */ 1348c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1358c3ff71bSJunchao Zhang { 1368c3ff71bSJunchao Zhang PetscErrorCode ierr; 1378c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1388c3ff71bSJunchao Zhang ConstPetscScalarViewDevice_t xv; 1398c3ff71bSJunchao Zhang PetscScalarViewDevice_t yv; 1408c3ff71bSJunchao Zhang 1418c3ff71bSJunchao Zhang PetscFunctionBegin; 1428c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1438c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 1448c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 1458c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1468c3ff71bSJunchao Zhang KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 1478c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 1488c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 149bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 1501e566348SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 1518c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1528c3ff71bSJunchao Zhang } 1538c3ff71bSJunchao Zhang 1548c3ff71bSJunchao Zhang /* y = A^H x */ 1558c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 1568c3ff71bSJunchao Zhang { 1578c3ff71bSJunchao Zhang PetscErrorCode ierr; 1588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1598c3ff71bSJunchao Zhang ConstPetscScalarViewDevice_t xv; 1608c3ff71bSJunchao Zhang PetscScalarViewDevice_t yv; 1618c3ff71bSJunchao Zhang 1628c3ff71bSJunchao Zhang PetscFunctionBegin; 1638c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1648c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 1658c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr); 1668c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1678c3ff71bSJunchao Zhang KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 1688c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 1698c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr); 170bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 1711e566348SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 1728c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1738c3ff71bSJunchao Zhang } 1748c3ff71bSJunchao Zhang 1758c3ff71bSJunchao Zhang /* z = A x + y */ 1768c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 1778c3ff71bSJunchao Zhang { 1788c3ff71bSJunchao Zhang PetscErrorCode ierr; 1798c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1808c3ff71bSJunchao Zhang ConstPetscScalarViewDevice_t xv,yv; 1818c3ff71bSJunchao Zhang PetscScalarViewDevice_t zv; 1828c3ff71bSJunchao Zhang 1838c3ff71bSJunchao Zhang PetscFunctionBegin; 1848c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1858c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 1868c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 1878c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 1888c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 1898c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1908c3ff71bSJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 1918c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 1928c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 1938c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 194bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 1951e566348SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 1968c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1978c3ff71bSJunchao Zhang } 1988c3ff71bSJunchao Zhang 1998c3ff71bSJunchao Zhang /* z = A^T x + y */ 2008c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 2018c3ff71bSJunchao Zhang { 2028c3ff71bSJunchao Zhang PetscErrorCode ierr; 2038c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 2048c3ff71bSJunchao Zhang ConstPetscScalarViewDevice_t xv,yv; 2058c3ff71bSJunchao Zhang PetscScalarViewDevice_t zv; 2068c3ff71bSJunchao Zhang 2078c3ff71bSJunchao Zhang PetscFunctionBegin; 2088c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 2098c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 2108c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 2118c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 2128c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 2138c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 2148c3ff71bSJunchao Zhang KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 2158c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 2168c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 2178c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 218bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 2191e566348SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 2208c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2218c3ff71bSJunchao Zhang } 2228c3ff71bSJunchao Zhang 2238c3ff71bSJunchao Zhang /* z = A^H x + y */ 2248c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 2258c3ff71bSJunchao Zhang { 2268c3ff71bSJunchao Zhang PetscErrorCode ierr; 2278c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 2288c3ff71bSJunchao Zhang ConstPetscScalarViewDevice_t xv,yv; 2298c3ff71bSJunchao Zhang PetscScalarViewDevice_t zv; 2308c3ff71bSJunchao Zhang 2318c3ff71bSJunchao Zhang PetscFunctionBegin; 2328c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 2338c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr); 2348c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr); 2358c3ff71bSJunchao Zhang ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr); 2368c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 2378c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 2388c3ff71bSJunchao Zhang KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 2398c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr); 2408c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr); 2418c3ff71bSJunchao Zhang ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr); 242bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 2431e566348SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr); 2448c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2458c3ff71bSJunchao Zhang } 2468c3ff71bSJunchao Zhang 2473d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 2488c3ff71bSJunchao Zhang { 2498c3ff71bSJunchao Zhang PetscErrorCode ierr; 2508c3ff71bSJunchao Zhang Mat B; 251c58ef05eSStefano Zampini Mat_SeqAIJ *aij; 2528c3ff71bSJunchao Zhang 2538c3ff71bSJunchao Zhang PetscFunctionBegin; 254a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 2558c3ff71bSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */ 2568c3ff71bSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 2578c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 2588c3ff71bSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 2598c3ff71bSJunchao Zhang } 2608c3ff71bSJunchao Zhang 2618c3ff71bSJunchao Zhang B = *newmat; 2628c3ff71bSJunchao Zhang ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 2638c3ff71bSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 2648c3ff71bSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 2650d8bce8aSStefano Zampini ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr); 266f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr); 267c58ef05eSStefano Zampini /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */ 268c58ef05eSStefano Zampini aij = (Mat_SeqAIJ*)B->data; 269c58ef05eSStefano Zampini aij->inode.use = PETSC_FALSE; 2708c3ff71bSJunchao Zhang 2718c3ff71bSJunchao Zhang B->offloadmask = PETSC_OFFLOAD_CPU; 2728c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2738c3ff71bSJunchao Zhang } 2748c3ff71bSJunchao Zhang 2758c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B) 2768c3ff71bSJunchao Zhang { 2778c3ff71bSJunchao Zhang PetscErrorCode ierr; 2788c3ff71bSJunchao Zhang 2798c3ff71bSJunchao Zhang PetscFunctionBegin; 2808c3ff71bSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2818c3ff71bSJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 2828c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2838c3ff71bSJunchao Zhang } 2848c3ff71bSJunchao Zhang 2858c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 2868c3ff71bSJunchao Zhang { 2878c3ff71bSJunchao Zhang PetscErrorCode ierr; 2888c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 2898c3ff71bSJunchao Zhang 2908c3ff71bSJunchao Zhang PetscFunctionBegin; 291*8f7e8f9dSMark Adams if (aijkok) { 292*8f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 293a587d139SMark delete aijkok->colmap_d; 294a587d139SMark delete aijkok->i_uncompressed_d; 295a587d139SMark } 296*8f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 297*8f7e8f9dSMark Adams } 2988c3ff71bSJunchao Zhang delete aijkok; 299f0cf5187SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr); 3008c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3018c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3028c3ff71bSJunchao Zhang } 3038c3ff71bSJunchao Zhang 304a3f881fbSStefano Zampini #if 0 305a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data) 306a3f881fbSStefano Zampini { 307a3f881fbSStefano Zampini MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data); 308a3f881fbSStefano Zampini 309a3f881fbSStefano Zampini PetscFunctionBegin; 310a3f881fbSStefano Zampini delete kh; 311a3f881fbSStefano Zampini PetscFunctionReturn(0); 312a3f881fbSStefano Zampini } 313a3f881fbSStefano Zampini 314a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 315a3f881fbSStefano Zampini { 316a3f881fbSStefano Zampini Mat_Product *product = C->product; 317a3f881fbSStefano Zampini Mat A,B; 318a3f881fbSStefano Zampini MatProductType ptype; 319a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 320a3f881fbSStefano Zampini bool tA,tB; 321a3f881fbSStefano Zampini PetscErrorCode ierr; 322a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 323a3f881fbSStefano Zampini Mat_SeqAIJ *c; 324a3f881fbSStefano Zampini 325a3f881fbSStefano Zampini PetscFunctionBegin; 326a3f881fbSStefano Zampini MatCheckProduct(C,1); 327a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 328a3f881fbSStefano Zampini A = product->A; 329a3f881fbSStefano Zampini B = product->B; 330a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 331a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 332a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 333a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 334a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 335a3f881fbSStefano Zampini kh = static_cast<MatMatKernelHandle_t*>(C->product->data); 336a3f881fbSStefano Zampini ptype = product->type; 337a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 338a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 339a3f881fbSStefano Zampini switch (ptype) { 340a3f881fbSStefano Zampini case MATPRODUCT_AB: 341a3f881fbSStefano Zampini tA = false; 342a3f881fbSStefano Zampini tB = false; 343a3f881fbSStefano Zampini break; 344a3f881fbSStefano Zampini case MATPRODUCT_AtB: 345a3f881fbSStefano Zampini tA = true; 346a3f881fbSStefano Zampini tB = false; 347a3f881fbSStefano Zampini break; 348a3f881fbSStefano Zampini case MATPRODUCT_ABt: 349a3f881fbSStefano Zampini tA = false; 350a3f881fbSStefano Zampini tB = true; 351a3f881fbSStefano Zampini break; 352a3f881fbSStefano Zampini default: 353a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 354a3f881fbSStefano Zampini } 355a3f881fbSStefano Zampini 356a3f881fbSStefano Zampini KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr); 357a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 358a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 359a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 360a3f881fbSStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 361a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 362a3f881fbSStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 363a3f881fbSStefano Zampini c->reallocs = 0; 364a3f881fbSStefano Zampini C->info.mallocs += 0; 365a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 366a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 367a3f881fbSStefano Zampini C->num_ass++; 368a3f881fbSStefano Zampini /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */ 369a3f881fbSStefano Zampini // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray() 370a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_BOTH; 371a3f881fbSStefano Zampini // Also, we should add support to copy back from device to host 372a3f881fbSStefano Zampini PetscFunctionReturn(0); 373a3f881fbSStefano Zampini } 374a3f881fbSStefano Zampini 375a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 376a3f881fbSStefano Zampini { 377a3f881fbSStefano Zampini Mat_Product *product = C->product; 378a3f881fbSStefano Zampini Mat A,B; 379a3f881fbSStefano Zampini MatProductType ptype; 380a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 381a3f881fbSStefano Zampini PetscInt m,n,k; 382a3f881fbSStefano Zampini bool tA,tB; 383a3f881fbSStefano Zampini PetscErrorCode ierr; 384a3f881fbSStefano Zampini Mat_SeqAIJ *c; 385a3f881fbSStefano Zampini MatMatKernelHandle_t *kh; 386a3f881fbSStefano Zampini 387a3f881fbSStefano Zampini PetscFunctionBegin; 388a3f881fbSStefano Zampini MatCheckProduct(C,1); 389a3f881fbSStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 390a3f881fbSStefano Zampini A = product->A; 391a3f881fbSStefano Zampini B = product->B; 392a3f881fbSStefano Zampini // TODO only copy the i,j data, not the values 393a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 394a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 395a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 396a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 397a3f881fbSStefano Zampini ptype = product->type; 398a3f881fbSStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 399a3f881fbSStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 400a3f881fbSStefano Zampini switch (ptype) { 401a3f881fbSStefano Zampini case MATPRODUCT_AB: 402a3f881fbSStefano Zampini tA = false; 403a3f881fbSStefano Zampini tB = false; 404a3f881fbSStefano Zampini m = A->rmap->n; 405a3f881fbSStefano Zampini n = B->cmap->n; 406a3f881fbSStefano Zampini break; 407a3f881fbSStefano Zampini case MATPRODUCT_AtB: 408a3f881fbSStefano Zampini tA = true; 409a3f881fbSStefano Zampini tB = false; 410a3f881fbSStefano Zampini m = A->cmap->n; 411a3f881fbSStefano Zampini n = B->cmap->n; 412a3f881fbSStefano Zampini break; 413a3f881fbSStefano Zampini case MATPRODUCT_ABt: 414a3f881fbSStefano Zampini tA = false; 415a3f881fbSStefano Zampini tB = true; 416a3f881fbSStefano Zampini m = A->rmap->n; 417a3f881fbSStefano Zampini n = B->rmap->n; 418a3f881fbSStefano Zampini break; 419a3f881fbSStefano Zampini default: 420a3f881fbSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 421a3f881fbSStefano Zampini } 422a3f881fbSStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 423a3f881fbSStefano Zampini ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr); 424a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 425a3f881fbSStefano Zampini 426a3f881fbSStefano Zampini kh = new MatMatKernelHandle_t; 427a3f881fbSStefano Zampini // TODO SZ: ADD RUNTIME SELECTION OF THESE 428a3f881fbSStefano Zampini kh->set_team_work_size(16); 429a3f881fbSStefano Zampini kh->set_dynamic_scheduling(true); 430a3f881fbSStefano Zampini // Select an spgemm algorithm, limited by configuration at compile-time and 431a3f881fbSStefano Zampini // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED, 432a3f881fbSStefano Zampini // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL} 433a3f881fbSStefano Zampini std::string myalg("SPGEMM_KK_MEMORY"); 434a3f881fbSStefano Zampini kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg)); 435a3f881fbSStefano Zampini 436a3f881fbSStefano Zampini ///////////////////////////////////// 437a3f881fbSStefano Zampini // TODO JZ 438a3f881fbSStefano Zampini ckok = NULL; //new Mat_SeqAIJKokkos(); 439a3f881fbSStefano Zampini C->spptr = ckok; 440a3f881fbSStefano Zampini KokkosCsrMatrix_t ccsr; // here only to have the code compile 441a3f881fbSStefano Zampini KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr); 442a3f881fbSStefano Zampini //cerr = WaitForKOKKOS();CHKERRCUDA(cerr); 443a3f881fbSStefano Zampini //c->nz = get_nnz_from_ccsr 444a3f881fbSStefano Zampini ////////////////////////////////////// 445a3f881fbSStefano Zampini c->singlemalloc = PETSC_FALSE; 446a3f881fbSStefano Zampini c->free_a = PETSC_TRUE; 447a3f881fbSStefano Zampini c->free_ij = PETSC_TRUE; 448a3f881fbSStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 449a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 450a3f881fbSStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 451a3f881fbSStefano Zampini //////////////////////////////////// 452a3f881fbSStefano Zampini // TODO JZ copy from device to c->i and c->j 453a3f881fbSStefano Zampini //////////////////////////////////// 454a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 455a3f881fbSStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 456a3f881fbSStefano Zampini c->maxnz = c->nz; 457a3f881fbSStefano Zampini c->nonzerorowcnt = 0; 458a3f881fbSStefano Zampini c->rmax = 0; 459a3f881fbSStefano Zampini for (k = 0; k < m; k++) { 460a3f881fbSStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 461a3f881fbSStefano Zampini c->ilen[k] = c->imax[k] = nn; 462a3f881fbSStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 463a3f881fbSStefano Zampini c->rmax = PetscMax(c->rmax,nn); 464a3f881fbSStefano Zampini } 465a3f881fbSStefano Zampini 466a3f881fbSStefano Zampini C->nonzerostate++; 467a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 468a3f881fbSStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 469a3f881fbSStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 470a3f881fbSStefano Zampini ckok->nonzerostate = C->nonzerostate; 471a3f881fbSStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 472a3f881fbSStefano Zampini C->preallocated = PETSC_TRUE; 473a3f881fbSStefano Zampini C->assembled = PETSC_FALSE; 474a3f881fbSStefano Zampini C->was_assembled = PETSC_FALSE; 475a3f881fbSStefano Zampini 476a3f881fbSStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 477a3f881fbSStefano Zampini C->product->data = kh; 478a3f881fbSStefano Zampini C->product->destroy = MatMatKernelHandleDestroy_Private; 479a3f881fbSStefano Zampini PetscFunctionReturn(0); 480a3f881fbSStefano Zampini } 481a3f881fbSStefano Zampini 482a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 483a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 484a3f881fbSStefano Zampini { 485a3f881fbSStefano Zampini Mat_Product *product = mat->product; 486a3f881fbSStefano Zampini PetscErrorCode ierr; 487a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 488a3f881fbSStefano Zampini 489a3f881fbSStefano Zampini PetscFunctionBegin; 490a3f881fbSStefano Zampini MatCheckProduct(mat,1); 491a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 492a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 493a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 494a3f881fbSStefano Zampini } 495a3f881fbSStefano Zampini if (Biskok && Ciskok) { 496a3f881fbSStefano Zampini switch (product->type) { 497a3f881fbSStefano Zampini case MATPRODUCT_AB: 498a3f881fbSStefano Zampini case MATPRODUCT_AtB: 499a3f881fbSStefano Zampini case MATPRODUCT_ABt: 500a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 501a3f881fbSStefano Zampini break; 502a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 503a3f881fbSStefano Zampini case MATPRODUCT_RARt: 504a3f881fbSStefano Zampini case MATPRODUCT_ABC: 505a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 506a3f881fbSStefano Zampini break; 507a3f881fbSStefano Zampini default: 508a3f881fbSStefano Zampini break; 509a3f881fbSStefano Zampini } 510a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 511a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 512a3f881fbSStefano Zampini } 513a3f881fbSStefano Zampini PetscFunctionReturn(0); 514a3f881fbSStefano Zampini } 515a3f881fbSStefano Zampini #endif 516a3f881fbSStefano Zampini 5178c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 5188c3ff71bSJunchao Zhang { 5198c3ff71bSJunchao Zhang PetscErrorCode ierr; 5208c3ff71bSJunchao Zhang 5218c3ff71bSJunchao Zhang PetscFunctionBegin; 5228c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 5238c3ff71bSJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 5248c3ff71bSJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 5258c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5268c3ff71bSJunchao Zhang } 5278c3ff71bSJunchao Zhang 528a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is) 529a587d139SMark { 530a587d139SMark PetscErrorCode ierr; 531a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 532a587d139SMark PetscFunctionBegin; 533a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 534a587d139SMark SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported"); 535a587d139SMark } 536a587d139SMark ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr); 537a587d139SMark PetscFunctionReturn(0); 538a587d139SMark } 539a587d139SMark 540f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 541f0cf5187SStefano Zampini { 542f0cf5187SStefano Zampini PetscErrorCode ierr; 543f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 544f0cf5187SStefano Zampini 545f0cf5187SStefano Zampini PetscFunctionBegin; 546f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 547f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 548f0cf5187SStefano Zampini KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d); 549f0cf5187SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 550f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 551f0cf5187SStefano Zampini ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr); 552f0cf5187SStefano Zampini // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 553f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 554f0cf5187SStefano Zampini PetscFunctionReturn(0); 555f0cf5187SStefano Zampini } 556f0cf5187SStefano Zampini 557a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 558a587d139SMark { 559a587d139SMark PetscErrorCode ierr; 560a587d139SMark PetscBool both = PETSC_FALSE; 561a587d139SMark Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 562a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 563a587d139SMark 564a587d139SMark PetscFunctionBegin; 565a587d139SMark if (aijkok && aijkok->a_d.data()) { 566f0cf5187SStefano Zampini KokkosBlas::fill(aijkok->a_d,0.0); 567a587d139SMark both = PETSC_TRUE; 568a587d139SMark } 569a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 570a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 571a587d139SMark if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 572a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 573a587d139SMark PetscFunctionReturn(0); 574a587d139SMark } 575a587d139SMark 576f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 577a587d139SMark { 578a587d139SMark PetscErrorCode ierr; 579a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 580a587d139SMark 581a587d139SMark PetscFunctionBegin; 582f0cf5187SStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 583a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 584a587d139SMark PetscFunctionReturn(0); 585a587d139SMark } 586f0cf5187SStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 587a587d139SMark PetscBool e; 588a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 589a587d139SMark if (e) { 590a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 591f0cf5187SStefano Zampini if (e) str = SAME_NONZERO_PATTERN; 592a587d139SMark } 593a587d139SMark } 594a587d139SMark if (str != SAME_NONZERO_PATTERN) { 595a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 596a587d139SMark PetscFunctionReturn(0); 597a587d139SMark } else { 598a587d139SMark if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) { 599a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 600a587d139SMark PetscFunctionReturn(0); 601f0cf5187SStefano Zampini } else { 602a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 603f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 604a587d139SMark } 605a587d139SMark Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 606a587d139SMark Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 607a587d139SMark if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) { 608f0cf5187SStefano Zampini KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d); 609f0cf5187SStefano Zampini Y->offloadmask = PETSC_OFFLOAD_GPU; 610f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 611a587d139SMark ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr); 612f0cf5187SStefano Zampini // TODO Remove: this can be removed once we implement matmat operations with KOKKOS 613f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr); 614a587d139SMark } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???"); 615a587d139SMark } 616a587d139SMark PetscFunctionReturn(0); 617a587d139SMark } 618a587d139SMark 619*8f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info) 620*8f7e8f9dSMark Adams { 621*8f7e8f9dSMark Adams PetscErrorCode ierr; 622*8f7e8f9dSMark Adams 623*8f7e8f9dSMark Adams PetscFunctionBegin; 624*8f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 625*8f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 626*8f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 627*8f7e8f9dSMark Adams PetscFunctionReturn(0); 628*8f7e8f9dSMark Adams } 629*8f7e8f9dSMark Adams 6308c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 6318c3ff71bSJunchao Zhang { 6328c3ff71bSJunchao Zhang PetscFunctionBegin; 6336f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 6346f3d89d0SStefano Zampini 635a587d139SMark A->ops->setvalues = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */ 6368c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 6378c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 6388c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 639a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 640f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 641a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 642a3f881fbSStefano Zampini //A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 6438c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 6448c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 6458c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 6468c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 6478c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 6488c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 6498c3ff71bSJunchao Zhang PetscFunctionReturn(0); 6508c3ff71bSJunchao Zhang } 6518c3ff71bSJunchao Zhang 6528c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 6538c3ff71bSJunchao Zhang /*C 6548c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 6558c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 6568c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 6578c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 6588c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 6598c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 6608c3ff71bSJunchao Zhang 6618c3ff71bSJunchao Zhang Collective 6628c3ff71bSJunchao Zhang 6638c3ff71bSJunchao Zhang Input Parameters: 6648c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 6658c3ff71bSJunchao Zhang . m - number of rows 6668c3ff71bSJunchao Zhang . n - number of columns 6678c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 6688c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 6698c3ff71bSJunchao Zhang (possibly different for each row) or NULL 6708c3ff71bSJunchao Zhang 6718c3ff71bSJunchao Zhang Output Parameter: 6728c3ff71bSJunchao Zhang . A - the matrix 6738c3ff71bSJunchao Zhang 6748c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 6758c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 6768c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 6778c3ff71bSJunchao Zhang 6788c3ff71bSJunchao Zhang Notes: 6798c3ff71bSJunchao Zhang If nnz is given then nz is ignored 6808c3ff71bSJunchao Zhang 6818c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 6828c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 6838c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 6848c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 6858c3ff71bSJunchao Zhang 6868c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 6878c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 6888c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 6898c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 6908c3ff71bSJunchao Zhang 6918c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 6928c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 6938c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 6948c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 6958c3ff71bSJunchao Zhang 6968c3ff71bSJunchao Zhang Level: intermediate 6978c3ff71bSJunchao Zhang 6988c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS 6998c3ff71bSJunchao Zhang @*/ 7008c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 7018c3ff71bSJunchao Zhang { 7028c3ff71bSJunchao Zhang PetscErrorCode ierr; 7038c3ff71bSJunchao Zhang 7048c3ff71bSJunchao Zhang PetscFunctionBegin; 7058c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 7068c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 7078c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 7088c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 7098c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 7108c3ff71bSJunchao Zhang PetscFunctionReturn(0); 7118c3ff71bSJunchao Zhang } 712930e68a5SMark Adams 713930e68a5SMark Adams // factorizations 714*8f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 715*8f7e8f9dSMark Adams // 716*8f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 717*8f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 718*8f7e8f9dSMark Adams // 719*8f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 720930e68a5SMark Adams { 721*8f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 722*8f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 723*8f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 724930e68a5SMark Adams PetscErrorCode ierr; 725*8f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 726*8f7e8f9dSMark Adams const PetscInt n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data(); 727*8f7e8f9dSMark Adams const PetscScalar *aa_d = aijkok->a_d.data(); 728*8f7e8f9dSMark Adams PetscScalar *ba_d = baijkok->a_d.data(); 729*8f7e8f9dSMark Adams PetscBool row_identity,col_identity; 730*8f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 731*8f7e8f9dSMark Adams PetscContainer container; 732930e68a5SMark Adams 733930e68a5SMark Adams PetscFunctionBegin; 734*8f7e8f9dSMark Adams if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 735*8f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 736*8f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 737*8f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 738*8f7e8f9dSMark Adams if (container) { 739*8f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 740*8f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 741*8f7e8f9dSMark Adams Nf = (*pNf)%1000; 742*8f7e8f9dSMark Adams nv = (*pNf)/1000; 743*8f7e8f9dSMark Adams if (nv>0) nVec = nv; 744*8f7e8f9dSMark Adams } else Nf = 1; 745*8f7e8f9dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 746*8f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 747*8f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 748*8f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 749*8f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 750*8f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 751*8f7e8f9dSMark Adams #endif 752*8f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 753*8f7e8f9dSMark Adams { 754*8f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 755*8f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 756*8f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 757*8f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 758*8f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 759*8f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 760*8f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 761*8f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 762*8f7e8f9dSMark Adams size_t flops_h = 0.0; 763*8f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 764*8f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 765*8f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 766*8f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 767*8f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 768*8f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 769*8f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 770*8f7e8f9dSMark Adams // Fill A --> fact 771*8f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 772*8f7e8f9dSMark Adams const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda 773*8f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 774*8f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 775*8f7e8f9dSMark Adams // zero rows of B 776*8f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 777*8f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 778*8f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 779*8f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 780*8f7e8f9dSMark Adams /* zero (unfactored row) */ 781*8f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 782*8f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 783*8f7e8f9dSMark Adams }); 784*8f7e8f9dSMark Adams // copy A into B 785*8f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 786*8f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 787*8f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 788*8f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 789*8f7e8f9dSMark Adams /* load in initial (unfactored row) */ 790*8f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 791*8f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 792*8f7e8f9dSMark Adams PetscScalar vala = av[j]; 793*8f7e8f9dSMark Adams if (colb == rowb) { 794*8f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 795*8f7e8f9dSMark Adams } else { 796*8f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 797*8f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 798*8f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 799*8f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 800*8f7e8f9dSMark Adams if (pbj[j] == colb) { 801*8f7e8f9dSMark Adams pba[j] = vala; 802*8f7e8f9dSMark Adams set++; 803*8f7e8f9dSMark Adams break; 804*8f7e8f9dSMark Adams } 805*8f7e8f9dSMark Adams } 806*8f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 807*8f7e8f9dSMark Adams } 808*8f7e8f9dSMark Adams } 809*8f7e8f9dSMark Adams }); 810*8f7e8f9dSMark Adams }); 811*8f7e8f9dSMark Adams Kokkos::fence(); 812930e68a5SMark Adams 813*8f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 814*8f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 815*8f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 816*8f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 817*8f7e8f9dSMark Adams const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda 818*8f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 819*8f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 820*8f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 821*8f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 822*8f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 823*8f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 824*8f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 825*8f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 826*8f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 827*8f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 828*8f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 829*8f7e8f9dSMark Adams else { 830*8f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 831*8f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 832*8f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 833*8f7e8f9dSMark Adams size_t st_idx; 834*8f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 835*8f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 836*8f7e8f9dSMark Adams // get column, there has got to be a better way 837*8f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 838*8f7e8f9dSMark Adams //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 839*8f7e8f9dSMark Adams if (pjL[j] == ii) { 840*8f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 841*8f7e8f9dSMark Adams idx = j; // output 842*8f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 843*8f7e8f9dSMark Adams } 844*8f7e8f9dSMark Adams }, st_idx); 845*8f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 846*8f7e8f9dSMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 847*8f7e8f9dSMark Adams else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 848*8f7e8f9dSMark Adams // U(i+1,:end) 849*8f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 850*8f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 851*8f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 852*8f7e8f9dSMark Adams if (col==myk) { 853*8f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 854*8f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 855*8f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 856*8f7e8f9dSMark Adams } else { 857*8f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 858*8f7e8f9dSMark Adams PetscInt high, low; 859*8f7e8f9dSMark Adams const PetscInt *startj; 860*8f7e8f9dSMark Adams if (col<myk) { // L 861*8f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 862*8f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 863*8f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 864*8f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 865*8f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 866*8f7e8f9dSMark Adams } else { 867*8f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 868*8f7e8f9dSMark Adams start = ba_d + idx; 869*8f7e8f9dSMark Adams startj= bj_d + idx; 870*8f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 871*8f7e8f9dSMark Adams } 872*8f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 873*8f7e8f9dSMark Adams low = 0; 874*8f7e8f9dSMark Adams high = (PetscInt)(end-start); 875*8f7e8f9dSMark Adams while (high-low > 5) { 876*8f7e8f9dSMark Adams int t = (low+high)/2; 877*8f7e8f9dSMark Adams if (startj[t] > col) high = t; 878*8f7e8f9dSMark Adams else low = t; 879*8f7e8f9dSMark Adams } 880*8f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 881*8f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 882*8f7e8f9dSMark Adams } 883*8f7e8f9dSMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 884*8f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 885*8f7e8f9dSMark Adams } 886*8f7e8f9dSMark Adams }); 887*8f7e8f9dSMark Adams } 888*8f7e8f9dSMark Adams } 889*8f7e8f9dSMark Adams }); 890*8f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 891*8f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 892*8f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 893*8f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 894*8f7e8f9dSMark Adams }); 895*8f7e8f9dSMark Adams Kokkos::fence(); 896*8f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 897*8f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 898*8f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 899*8f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 900*8f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 901*8f7e8f9dSMark Adams #endif 902*8f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 903*8f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 904*8f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 905*8f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 906*8f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 907*8f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 908*8f7e8f9dSMark Adams if (i < end) { 909*8f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 910*8f7e8f9dSMark Adams *pv = 1.0/(*pv); 911*8f7e8f9dSMark Adams } 912*8f7e8f9dSMark Adams }); 913*8f7e8f9dSMark Adams }); 914*8f7e8f9dSMark Adams } 915*8f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 916*8f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 917*8f7e8f9dSMark Adams #endif 918*8f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 919*8f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 920*8f7e8f9dSMark Adams 921*8f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 922*8f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 923*8f7e8f9dSMark Adams if (b->inode.size) { 924*8f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 925*8f7e8f9dSMark Adams } else if (row_identity && col_identity) { 926*8f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 927*8f7e8f9dSMark Adams } else { 928*8f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 929*8f7e8f9dSMark Adams } 930*8f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 931*8f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 932*8f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 933*8f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 934*8f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 935*8f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 936*8f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 937*8f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 938*8f7e8f9dSMark Adams 939930e68a5SMark Adams PetscFunctionReturn(0); 940930e68a5SMark Adams } 941930e68a5SMark Adams 942930e68a5SMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 943930e68a5SMark Adams { 944930e68a5SMark Adams PetscErrorCode ierr; 945930e68a5SMark Adams 946930e68a5SMark Adams PetscFunctionBegin; 947930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 948930e68a5SMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS; 949930e68a5SMark Adams PetscFunctionReturn(0); 950930e68a5SMark Adams } 951930e68a5SMark Adams 952*8f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 953*8f7e8f9dSMark Adams { 954*8f7e8f9dSMark Adams PetscErrorCode ierr; 955*8f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 956*8f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 957930e68a5SMark Adams 958*8f7e8f9dSMark Adams PetscFunctionBegin; 959*8f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 960*8f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 961*8f7e8f9dSMark Adams // move B data into Kokkos 962*8f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 963*8f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 964*8f7e8f9dSMark Adams { 965*8f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 966*8f7e8f9dSMark Adams if (!baijkok->diag_d) { 967*8f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 968*8f7e8f9dSMark Adams baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_diag)); 969*8f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 970*8f7e8f9dSMark Adams } 971*8f7e8f9dSMark Adams } 972*8f7e8f9dSMark Adams PetscFunctionReturn(0); 973*8f7e8f9dSMark Adams } 974*8f7e8f9dSMark Adams 975*8f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*); 976*8f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat,MatFactorType,Mat*); 977930e68a5SMark Adams 978930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 979930e68a5SMark Adams { 980930e68a5SMark Adams PetscErrorCode ierr; 981930e68a5SMark Adams 982930e68a5SMark Adams PetscFunctionBegin; 983930e68a5SMark Adams ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr); 984*8f7e8f9dSMark Adams ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 985930e68a5SMark Adams PetscFunctionReturn(0); 986930e68a5SMark Adams } 987930e68a5SMark Adams 988930e68a5SMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type) 989930e68a5SMark Adams { 990930e68a5SMark Adams PetscFunctionBegin; 991930e68a5SMark Adams *type = MATSOLVERKOKKOS; 992930e68a5SMark Adams PetscFunctionReturn(0); 993930e68a5SMark Adams } 994930e68a5SMark Adams 995*8f7e8f9dSMark Adams // use -pc_factor_mat_solver_type kokkosdevice 996*8f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 997*8f7e8f9dSMark Adams { 998*8f7e8f9dSMark Adams PetscFunctionBegin; 999*8f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 1000*8f7e8f9dSMark Adams PetscFunctionReturn(0); 1001*8f7e8f9dSMark Adams } 1002*8f7e8f9dSMark Adams 1003930e68a5SMark Adams /*MC 1004930e68a5SMark Adams MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1005930e68a5SMark Adams on a single GPU of type, seqaijkokkos, aijkokkos. 1006930e68a5SMark Adams 1007930e68a5SMark Adams Level: beginner 1008930e68a5SMark Adams 1009930e68a5SMark Adams .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation 1010930e68a5SMark Adams M*/ 1011930e68a5SMark Adams 1012930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B) 1013930e68a5SMark Adams { 1014930e68a5SMark Adams PetscErrorCode ierr; 1015930e68a5SMark Adams PetscInt n = A->rmap->n; 1016930e68a5SMark Adams 1017930e68a5SMark Adams PetscFunctionBegin; 1018930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1019930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1020930e68a5SMark Adams (*B)->factortype = ftype; 1021930e68a5SMark Adams (*B)->useordering = PETSC_TRUE; 1022930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1023930e68a5SMark Adams 1024*8f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1025930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1026930e68a5SMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOS; 1027930e68a5SMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1028930e68a5SMark Adams 1029930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1030930e68a5SMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr); 1031930e68a5SMark Adams PetscFunctionReturn(0); 1032930e68a5SMark Adams } 1033*8f7e8f9dSMark Adams 1034*8f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1035*8f7e8f9dSMark Adams { 1036*8f7e8f9dSMark Adams PetscErrorCode ierr; 1037*8f7e8f9dSMark Adams PetscInt n = A->rmap->n; 1038*8f7e8f9dSMark Adams 1039*8f7e8f9dSMark Adams PetscFunctionBegin; 1040*8f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1041*8f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1042*8f7e8f9dSMark Adams (*B)->factortype = ftype; 1043*8f7e8f9dSMark Adams (*B)->useordering = PETSC_TRUE; 1044*8f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1045*8f7e8f9dSMark Adams 1046*8f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1047*8f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1048*8f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1049*8f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1050*8f7e8f9dSMark Adams 1051*8f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1052*8f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1053*8f7e8f9dSMark Adams PetscFunctionReturn(0); 1054*8f7e8f9dSMark Adams } 1055