1 #include <petscdevice_cuda.h> 2 #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h> 3 4 /* A class that manages helper arrays assisting parallel PCApply() with CUDA */ 5 struct PC_VPBJacobi_CUDA { 6 /* Cache the old sizes to check if we need realloc */ 7 PetscInt n; /* number of rows of the local matrix */ 8 PetscInt nblocks; /* number of point blocks */ 9 PetscInt nsize; /* sum of sizes of the point blocks */ 10 11 /* Helper arrays that are pre-computed on host and then copied to device. 12 bs: [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n. 13 bs2: [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize. 14 matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block 15 */ 16 PetscInt *bs_h, *bs2_h, *matIdx_h; 17 PetscInt *bs_d, *bs2_d, *matIdx_d; 18 19 MatScalar *diag_d; /* [nsize], store inverse of the point blocks on device */ 20 21 PC_VPBJacobi_CUDA(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_h) : n(n), nblocks(nblocks), nsize(nsize) 22 { 23 /* malloc memory on host and device, and then update */ 24 PetscCallVoid(PetscMalloc3(nblocks + 1, &bs_h, nblocks + 1, &bs2_h, n, &matIdx_h)); 25 PetscCallCUDAVoid(cudaMalloc(&bs_d, sizeof(PetscInt) * (nblocks + 1))); 26 PetscCallCUDAVoid(cudaMalloc(&bs2_d, sizeof(PetscInt) * (nblocks + 1))); 27 PetscCallCUDAVoid(cudaMalloc(&matIdx_d, sizeof(PetscInt) * n)); 28 PetscCallCUDAVoid(cudaMalloc(&diag_d, sizeof(MatScalar) * nsize)); 29 PetscCallVoid(UpdateOffsetsOnDevice(bsizes, diag_h)); 30 } 31 32 PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_h) 33 { 34 PetscFunctionBegin; 35 PetscCall(ComputeOffsetsOnHost(bsizes)); 36 PetscCallCUDA(cudaMemcpy(bs_d, bs_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice)); 37 PetscCallCUDA(cudaMemcpy(bs2_d, bs2_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice)); 38 PetscCallCUDA(cudaMemcpy(matIdx_d, matIdx_h, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 39 PetscCallCUDA(cudaMemcpy(diag_d, diag_h, sizeof(MatScalar) * nsize, cudaMemcpyHostToDevice)); 40 PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize)); 41 PetscFunctionReturn(0); 42 } 43 44 ~PC_VPBJacobi_CUDA() 45 { 46 PetscCallVoid(PetscFree3(bs_h, bs2_h, matIdx_h)); 47 PetscCallCUDAVoid(cudaFree(bs_d)); 48 PetscCallCUDAVoid(cudaFree(bs2_d)); 49 PetscCallCUDAVoid(cudaFree(matIdx_d)); 50 PetscCallCUDAVoid(cudaFree(diag_d)); 51 } 52 53 private: 54 PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes) 55 { 56 PetscFunctionBegin; 57 bs_h[0] = bs2_h[0] = 0; 58 for (PetscInt i = 0; i < nblocks; i++) { 59 bs_h[i + 1] = bs_h[i] + bsizes[i]; 60 bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i]; 61 for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i; 62 } 63 PetscFunctionReturn(0); 64 } 65 }; 66 67 /* Like cublasDgemvBatched() but with variable-sized blocks 68 69 Input Parameters: 70 + n - number of rows of the local matrix 71 . bs - [nblocks+1], prefix sum of bsizes[] 72 . bs2 - [nblocks+1], prefix sum of squares of bsizes[] 73 . matIdx - [n], store block/matrix index for each row 74 . A - blocks of the matrix back to back in column-major order 75 - x - the input vector 76 77 Output Parameter: 78 . y - the output vector 79 */ 80 __global__ static void MatMultBatched(PetscInt n, const PetscInt *bs, const PetscInt *bs2, const PetscInt *matIdx, const MatScalar *A, const PetscScalar *x, PetscScalar *y) 81 { 82 const PetscInt gridSize = gridDim.x * blockDim.x; 83 PetscInt tid = blockIdx.x * blockDim.x + threadIdx.x; 84 PetscInt i, j, k, m; 85 86 /* One row per thread. The blocks/matrices are stored in column-major order */ 87 for (; tid < n; tid += gridSize) { 88 k = matIdx[tid]; /* k-th block */ 89 m = bs[k + 1] - bs[k]; /* block size of the k-th block */ 90 i = tid - bs[k]; /* i-th row of the block */ 91 A += bs2[k] + i; /* advance A to the first entry of i-th row */ 92 x += bs[k]; 93 y += bs[k]; 94 95 y[i] = 0.0; 96 for (j = 0; j < m; j++) { 97 y[i] += A[0] * x[j]; 98 A += m; 99 } 100 } 101 } 102 103 static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc, Vec x, Vec y) 104 { 105 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 106 PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr); 107 const PetscScalar *xx; 108 PetscScalar *yy; 109 PetscInt n; 110 111 PetscFunctionBegin; 112 PetscCall(PetscLogGpuTimeBegin()); 113 if (PetscDefined(USE_DEBUG)) { 114 PetscBool isCuda; 115 PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &isCuda, VECSEQCUDA, VECMPICUDA, "")); 116 if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &isCuda, VECSEQCUDA, VECMPICUDA, "")); 117 PetscCheck(isCuda, PETSC_COMM_SELF, PETSC_ERR_SUP, "PC: applying a CUDA pmat to non-cuda vectors"); 118 } 119 120 PetscCall(MatGetLocalSize(pc->pmat, &n, NULL)); 121 if (n) { 122 PetscInt gridSize = PetscMin((n + 255) / 256, 2147483647); /* <= 2^31-1 */ 123 PetscCall(VecCUDAGetArrayRead(x, &xx)); 124 PetscCall(VecCUDAGetArrayWrite(y, &yy)); 125 MatMultBatched<<<gridSize, 256>>>(n, pcuda->bs_d, pcuda->bs2_d, pcuda->matIdx_d, pcuda->diag_d, xx, yy); 126 PetscCallCUDA(cudaGetLastError()); 127 PetscCall(VecCUDARestoreArrayRead(x, &xx)); 128 PetscCall(VecCUDARestoreArrayWrite(y, &yy)); 129 } 130 PetscCall(PetscLogGpuFlops(pcuda->nsize * 2)); /* FMA on entries in all blocks */ 131 PetscCall(PetscLogGpuTimeEnd()); 132 PetscFunctionReturn(0); 133 } 134 135 static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc) 136 { 137 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 138 139 PetscFunctionBegin; 140 PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA *>(jac->spptr)); 141 PCDestroy_VPBJacobi(pc); 142 PetscFunctionReturn(0); 143 } 144 145 PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc) 146 { 147 PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 148 PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr); 149 PetscInt i, n, nblocks, nsize = 0; 150 const PetscInt *bsizes; 151 152 PetscFunctionBegin; 153 PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */ 154 PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes)); 155 for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i]; 156 PetscCall(MatGetLocalSize(pc->pmat, &n, NULL)); 157 158 /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */ 159 if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) { 160 PetscCallCXX(delete pcuda); 161 pcuda = nullptr; 162 } 163 164 if (!pcuda) { /* allocate the struct along with the helper arrays from the scratch */ 165 PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n, nblocks, nsize, bsizes, jac->diag)); 166 } else { /* update the value only */ 167 PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes, jac->diag)); 168 } 169 170 pc->ops->apply = PCApply_VPBJacobi_CUDA; 171 pc->ops->destroy = PCDestroy_VPBJacobi_CUDA; 172 PetscFunctionReturn(0); 173 } 174