1 #include <petscvec_kokkos.hpp> 2 #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp> 3 #include <petscdevice.h> 4 #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h> 5 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> // for MatInvertVariableBlockDiagonal_SeqAIJKokkos 6 7 /* A class that manages helper arrays assisting parallel PCApply() with Kokkos */ 8 struct PC_VPBJacobi_Kokkos { 9 /* Cache the old sizes to check if we need realloc */ 10 PetscInt n; /* number of rows of the local matrix */ 11 PetscInt nblocks; /* number of point blocks */ 12 PetscInt nsize; /* sum of sizes (elements) of the point blocks */ 13 14 /* Helper arrays that are pre-computed on host and then copied to device. 15 bs: [nblocks+1], "csr" version of bsizes[] 16 bs2: [nblocks+1], "csr" version of squares of bsizes[] 17 blkMap: [n], row i of the local matrix belongs to the blkMap[i]-th block 18 */ 19 PetscIntKokkosDualView bs_dual, bs2_dual, blkMap_dual; 20 PetscScalarKokkosView diag; // buffer to store diagonal blocks 21 PetscScalarKokkosView work; // work buffer, with the same size as diag[] 22 PetscLogDouble setupFlops; 23 24 // clang-format off 25 // n: size of the matrix 26 // nblocks: number of blocks 27 // nsize: sum bsizes[i]^2 for i=0..nblocks 28 // bsizes[nblocks]: sizes of blocks 29 PC_VPBJacobi_Kokkos(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes) : 30 n(n), nblocks(nblocks), nsize(nsize), bs_dual(NoInit("bs_dual"), nblocks + 1), 31 bs2_dual(NoInit("bs2_dual"), nblocks + 1), blkMap_dual(NoInit("blkMap_dual"), n), 32 diag(NoInit("diag"), nsize), work(NoInit("work"), nsize) 33 { 34 PetscCallVoid(BuildHelperArrays(bsizes)); 35 } 36 // clang-format on 37 38 private: 39 PetscErrorCode BuildHelperArrays(const PetscInt *bsizes) 40 { 41 PetscInt *bs_h = bs_dual.view_host().data(); 42 PetscInt *bs2_h = bs2_dual.view_host().data(); 43 PetscInt *blkMap_h = blkMap_dual.view_host().data(); 44 45 PetscFunctionBegin; 46 setupFlops = 0.0; 47 bs_h[0] = bs2_h[0] = 0; 48 for (PetscInt i = 0; i < nblocks; i++) { 49 PetscInt m = bsizes[i]; 50 bs_h[i + 1] = bs_h[i] + m; 51 bs2_h[i + 1] = bs2_h[i] + m * m; 52 for (PetscInt j = 0; j < m; j++) blkMap_h[bs_h[i] + j] = i; 53 // m^3/3 FMA for A=LU factorization; m^3 FMA for solving (LU)X=I to get the inverse 54 setupFlops += 8.0 * m * m * m / 3; 55 } 56 57 PetscCallCXX(bs_dual.modify_host()); 58 PetscCallCXX(bs2_dual.modify_host()); 59 PetscCallCXX(blkMap_dual.modify_host()); 60 PetscCallCXX(bs_dual.sync_device()); 61 PetscCallCXX(bs2_dual.sync_device()); 62 PetscCallCXX(blkMap_dual.sync_device()); 63 PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * (nblocks + 1) + n))); 64 PetscFunctionReturn(PETSC_SUCCESS); 65 } 66 }; 67 68 template <PetscBool transpose> 69 static PetscErrorCode PCApplyOrTranspose_VPBJacobi_Kokkos(PC pc, Vec x, Vec y) 70 { 71 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 72 PC_VPBJacobi_Kokkos *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr); 73 ConstPetscScalarKokkosView xv; 74 PetscScalarKokkosView yv; 75 PetscScalarKokkosView diag = pckok->diag; 76 PetscIntKokkosView bs = pckok->bs_dual.view_device(); 77 PetscIntKokkosView bs2 = pckok->bs2_dual.view_device(); 78 PetscIntKokkosView blkMap = pckok->blkMap_dual.view_device(); 79 const char *label = transpose ? "PCApplyTranspose_VPBJacobi_Kokkos" : "PCApply_VPBJacobi_Kokkos"; 80 81 PetscFunctionBegin; 82 PetscCall(PetscLogGpuTimeBegin()); 83 VecErrorIfNotKokkos(x); 84 VecErrorIfNotKokkos(y); 85 PetscCall(VecGetKokkosView(x, &xv)); 86 PetscCall(VecGetKokkosViewWrite(y, &yv)); 87 PetscCallCXX(Kokkos::parallel_for( 88 label, pckok->n, KOKKOS_LAMBDA(PetscInt row) { 89 const PetscScalar *Ap, *xp; 90 PetscScalar *yp; 91 PetscInt i, j, k, m; 92 93 k = blkMap(row); /* k-th block/matrix */ 94 m = bs(k + 1) - bs(k); /* block size of the k-th block */ 95 i = row - bs(k); /* i-th row of the block */ 96 Ap = &diag(bs2(k) + i * (transpose ? m : 1)); /* Ap points to the first entry of i-th row/column */ 97 xp = &xv(bs(k)); 98 yp = &yv(bs(k)); 99 100 yp[i] = 0.0; 101 for (j = 0; j < m; j++) { 102 yp[i] += Ap[0] * xp[j]; 103 Ap += transpose ? 1 : m; 104 } 105 })); 106 PetscCall(VecRestoreKokkosView(x, &xv)); 107 PetscCall(VecRestoreKokkosViewWrite(y, &yv)); 108 PetscCall(PetscLogGpuFlops(pckok->nsize * 2)); /* FMA on entries in all blocks */ 109 PetscCall(PetscLogGpuTimeEnd()); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 static PetscErrorCode PCDestroy_VPBJacobi_Kokkos(PC pc) 114 { 115 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 116 117 PetscFunctionBegin; 118 PetscCallCXX(delete static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr)); 119 PetscCall(PCDestroy_VPBJacobi(pc)); 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_Kokkos(PC pc) 124 { 125 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 126 PC_VPBJacobi_Kokkos *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr); 127 PetscInt i, nlocal, nblocks, nsize = 0; 128 const PetscInt *bsizes; 129 130 PetscFunctionBegin; 131 PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes)); 132 PetscCall(MatGetLocalSize(pc->pmat, &nlocal, NULL)); 133 PetscCheck(!nlocal || nblocks, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call MatSetVariableBlockSizes() before using PCVPBJACOBI"); 134 135 if (!jac->diag) { 136 PetscInt max_bs = -1, min_bs = PETSC_MAX_INT; 137 for (i = 0; i < nblocks; i++) { 138 min_bs = PetscMin(min_bs, bsizes[i]); 139 max_bs = PetscMax(max_bs, bsizes[i]); 140 nsize += bsizes[i] * bsizes[i]; 141 } 142 jac->nblocks = nblocks; 143 jac->min_bs = min_bs; 144 jac->max_bs = max_bs; 145 } 146 147 // If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway 148 if (pckok && (pckok->n != nlocal || pckok->nblocks != nblocks || pckok->nsize != nsize)) { 149 PetscCallCXX(delete pckok); 150 pckok = nullptr; 151 } 152 153 PetscCall(PetscLogGpuTimeBegin()); 154 if (!pckok) { 155 PetscCallCXX(pckok = new PC_VPBJacobi_Kokkos(nlocal, nblocks, nsize, bsizes)); 156 jac->spptr = pckok; 157 } 158 159 // Extract diagonal blocks from the matrix and compute their inverse 160 const auto &bs = pckok->bs_dual.view_device(); 161 const auto &bs2 = pckok->bs2_dual.view_device(); 162 const auto &blkMap = pckok->blkMap_dual.view_device(); 163 PetscCall(MatInvertVariableBlockDiagonal_SeqAIJKokkos(pc->pmat, bs, bs2, blkMap, pckok->work, pckok->diag)); 164 pc->ops->apply = PCApplyOrTranspose_VPBJacobi_Kokkos<PETSC_FALSE>; 165 pc->ops->applytranspose = PCApplyOrTranspose_VPBJacobi_Kokkos<PETSC_TRUE>; 166 pc->ops->destroy = PCDestroy_VPBJacobi_Kokkos; 167 PetscCall(PetscLogGpuTimeEnd()); 168 PetscCall(PetscLogGpuFlops(pckok->setupFlops)); 169 PetscFunctionReturn(PETSC_SUCCESS); 170 } 171