10e6b6b59SJacob Faibussowitsch #include <petscdevice_cuda.h> 2f1be3500SJunchao Zhang #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h> 3f1be3500SJunchao Zhang 4f1be3500SJunchao Zhang /* A class that manages helper arrays assisting parallel PCApply() with CUDA */ 5f1be3500SJunchao Zhang struct PC_VPBJacobi_CUDA { 6f1be3500SJunchao Zhang /* Cache the old sizes to check if we need realloc */ 7f1be3500SJunchao Zhang PetscInt n; /* number of rows of the local matrix */ 8f1be3500SJunchao Zhang PetscInt nblocks; /* number of point blocks */ 9f1be3500SJunchao Zhang PetscInt nsize; /* sum of sizes of the point blocks */ 10f1be3500SJunchao Zhang 11f1be3500SJunchao Zhang /* Helper arrays that are pre-computed on host and then copied to device. 12f1be3500SJunchao Zhang bs: [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n. 139a56b474SJunchao Zhang bs2: [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize. 14f1be3500SJunchao Zhang matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block 15f1be3500SJunchao Zhang */ 16f1be3500SJunchao Zhang PetscInt *bs_h, *bs2_h, *matIdx_h; 17f1be3500SJunchao Zhang PetscInt *bs_d, *bs2_d, *matIdx_d; 18f1be3500SJunchao Zhang 19f1be3500SJunchao Zhang MatScalar *diag_d; /* [nsize], store inverse of the point blocks on device */ 20f1be3500SJunchao Zhang 21d71ae5a4SJacob Faibussowitsch PC_VPBJacobi_CUDA(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_h) : n(n), nblocks(nblocks), nsize(nsize) 22d71ae5a4SJacob Faibussowitsch { 23f1be3500SJunchao Zhang /* malloc memory on host and device, and then update */ 24f1be3500SJunchao Zhang PetscCallVoid(PetscMalloc3(nblocks + 1, &bs_h, nblocks + 1, &bs2_h, n, &matIdx_h)); 25f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&bs_d, sizeof(PetscInt) * (nblocks + 1))); 26f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&bs2_d, sizeof(PetscInt) * (nblocks + 1))); 27f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&matIdx_d, sizeof(PetscInt) * n)); 28f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&diag_d, sizeof(MatScalar) * nsize)); 29f1be3500SJunchao Zhang PetscCallVoid(UpdateOffsetsOnDevice(bsizes, diag_h)); 30f1be3500SJunchao Zhang } 31f1be3500SJunchao Zhang 32d71ae5a4SJacob Faibussowitsch PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_h) 33d71ae5a4SJacob Faibussowitsch { 34f1be3500SJunchao Zhang PetscFunctionBegin; 35f1be3500SJunchao Zhang PetscCall(ComputeOffsetsOnHost(bsizes)); 36f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(bs_d, bs_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice)); 37f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(bs2_d, bs2_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice)); 38f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(matIdx_d, matIdx_h, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 39f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(diag_d, diag_h, sizeof(MatScalar) * nsize, cudaMemcpyHostToDevice)); 409a56b474SJunchao Zhang PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize)); 41f1be3500SJunchao Zhang PetscFunctionReturn(0); 42f1be3500SJunchao Zhang } 43f1be3500SJunchao Zhang 44d71ae5a4SJacob Faibussowitsch ~PC_VPBJacobi_CUDA() 45d71ae5a4SJacob Faibussowitsch { 46f1be3500SJunchao Zhang PetscCallVoid(PetscFree3(bs_h, bs2_h, matIdx_h)); 47f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(bs_d)); 48f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(bs2_d)); 49f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(matIdx_d)); 50f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(diag_d)); 51f1be3500SJunchao Zhang } 52f1be3500SJunchao Zhang 53f1be3500SJunchao Zhang private: 54d71ae5a4SJacob Faibussowitsch PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes) 55d71ae5a4SJacob Faibussowitsch { 56f1be3500SJunchao Zhang PetscFunctionBegin; 57f1be3500SJunchao Zhang bs_h[0] = bs2_h[0] = 0; 58f1be3500SJunchao Zhang for (PetscInt i = 0; i < nblocks; i++) { 59f1be3500SJunchao Zhang bs_h[i + 1] = bs_h[i] + bsizes[i]; 60f1be3500SJunchao Zhang bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i]; 61f1be3500SJunchao Zhang for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i; 62f1be3500SJunchao Zhang } 63f1be3500SJunchao Zhang PetscFunctionReturn(0); 64f1be3500SJunchao Zhang } 65f1be3500SJunchao Zhang }; 66f1be3500SJunchao Zhang 67f1be3500SJunchao Zhang /* Like cublasDgemvBatched() but with variable-sized blocks 68f1be3500SJunchao Zhang 69f1be3500SJunchao Zhang Input Parameters: 70f1be3500SJunchao Zhang + n - number of rows of the local matrix 71f1be3500SJunchao Zhang . bs - [nblocks+1], prefix sum of bsizes[] 72f1be3500SJunchao Zhang . bs2 - [nblocks+1], prefix sum of squares of bsizes[] 73f1be3500SJunchao Zhang . matIdx - [n], store block/matrix index for each row 74f1be3500SJunchao Zhang . A - blocks of the matrix back to back in column-major order 75*69eda9daSJed Brown . x - the input vector 76*69eda9daSJed Brown - transpose - whether it is MatMult for Ax (false) or MatMultTranspose for A^Tx (true) 77f1be3500SJunchao Zhang 78f1be3500SJunchao Zhang Output Parameter: 79f1be3500SJunchao Zhang . y - the output vector 80f1be3500SJunchao Zhang */ 81*69eda9daSJed Brown __global__ static void MatMultBatched(PetscInt n, const PetscInt *bs, const PetscInt *bs2, const PetscInt *matIdx, const MatScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose) 82d71ae5a4SJacob Faibussowitsch { 83f1be3500SJunchao Zhang const PetscInt gridSize = gridDim.x * blockDim.x; 84f1be3500SJunchao Zhang PetscInt tid = blockIdx.x * blockDim.x + threadIdx.x; 85f1be3500SJunchao Zhang PetscInt i, j, k, m; 86f1be3500SJunchao Zhang 87f1be3500SJunchao Zhang /* One row per thread. The blocks/matrices are stored in column-major order */ 88f1be3500SJunchao Zhang for (; tid < n; tid += gridSize) { 89f1be3500SJunchao Zhang k = matIdx[tid]; /* k-th block */ 90f1be3500SJunchao Zhang m = bs[k + 1] - bs[k]; /* block size of the k-th block */ 91f1be3500SJunchao Zhang i = tid - bs[k]; /* i-th row of the block */ 92*69eda9daSJed Brown A += bs2[k] + i * (transpose ? m : 1); /* advance A to the first entry of i-th row */ 93f1be3500SJunchao Zhang x += bs[k]; 94f1be3500SJunchao Zhang y += bs[k]; 95f1be3500SJunchao Zhang 96f1be3500SJunchao Zhang y[i] = 0.0; 979371c9d4SSatish Balay for (j = 0; j < m; j++) { 989371c9d4SSatish Balay y[i] += A[0] * x[j]; 99*69eda9daSJed Brown A += (transpose ? 1 : m); 1009371c9d4SSatish Balay } 101f1be3500SJunchao Zhang } 102f1be3500SJunchao Zhang } 103f1be3500SJunchao Zhang 104*69eda9daSJed Brown static PetscErrorCode PCApplyOrTranspose_VPBJacobi_CUDA(PC pc, Vec x, Vec y, PetscBool transpose) 105d71ae5a4SJacob Faibussowitsch { 106f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 107f1be3500SJunchao Zhang PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr); 108f1be3500SJunchao Zhang const PetscScalar *xx; 109f1be3500SJunchao Zhang PetscScalar *yy; 110f1be3500SJunchao Zhang PetscInt n; 111f1be3500SJunchao Zhang 112f1be3500SJunchao Zhang PetscFunctionBegin; 1139a56b474SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 114f1be3500SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 115f1be3500SJunchao Zhang PetscBool isCuda; 116f1be3500SJunchao Zhang PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &isCuda, VECSEQCUDA, VECMPICUDA, "")); 117f1be3500SJunchao Zhang if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &isCuda, VECSEQCUDA, VECMPICUDA, "")); 118f1be3500SJunchao Zhang PetscCheck(isCuda, PETSC_COMM_SELF, PETSC_ERR_SUP, "PC: applying a CUDA pmat to non-cuda vectors"); 119f1be3500SJunchao Zhang } 120f1be3500SJunchao Zhang 121f1be3500SJunchao Zhang PetscCall(MatGetLocalSize(pc->pmat, &n, NULL)); 122f1be3500SJunchao Zhang if (n) { 123f1be3500SJunchao Zhang PetscInt gridSize = PetscMin((n + 255) / 256, 2147483647); /* <= 2^31-1 */ 124f1be3500SJunchao Zhang PetscCall(VecCUDAGetArrayRead(x, &xx)); 125f1be3500SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(y, &yy)); 126*69eda9daSJed Brown MatMultBatched<<<gridSize, 256>>>(n, pcuda->bs_d, pcuda->bs2_d, pcuda->matIdx_d, pcuda->diag_d, xx, yy, transpose); 127f1be3500SJunchao Zhang PetscCallCUDA(cudaGetLastError()); 128f1be3500SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(x, &xx)); 129f1be3500SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(y, &yy)); 130f1be3500SJunchao Zhang } 1319a56b474SJunchao Zhang PetscCall(PetscLogGpuFlops(pcuda->nsize * 2)); /* FMA on entries in all blocks */ 1329a56b474SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 133f1be3500SJunchao Zhang PetscFunctionReturn(0); 134f1be3500SJunchao Zhang } 135f1be3500SJunchao Zhang 136*69eda9daSJed Brown static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc, Vec x, Vec y) 137*69eda9daSJed Brown { 138*69eda9daSJed Brown PetscFunctionBegin; 139*69eda9daSJed Brown PetscCall(PCApplyOrTranspose_VPBJacobi_CUDA(pc, x, y, PETSC_FALSE)); 140*69eda9daSJed Brown PetscFunctionReturn(0); 141*69eda9daSJed Brown } 142*69eda9daSJed Brown 143*69eda9daSJed Brown static PetscErrorCode PCApplyTranspose_VPBJacobi_CUDA(PC pc, Vec x, Vec y) 144*69eda9daSJed Brown { 145*69eda9daSJed Brown PetscFunctionBegin; 146*69eda9daSJed Brown PetscCall(PCApplyOrTranspose_VPBJacobi_CUDA(pc, x, y, PETSC_TRUE)); 147*69eda9daSJed Brown PetscFunctionReturn(0); 148*69eda9daSJed Brown } 149*69eda9daSJed Brown 150d71ae5a4SJacob Faibussowitsch static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc) 151d71ae5a4SJacob Faibussowitsch { 152f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 153f1be3500SJunchao Zhang 154f1be3500SJunchao Zhang PetscFunctionBegin; 155f1be3500SJunchao Zhang PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA *>(jac->spptr)); 156f1be3500SJunchao Zhang PCDestroy_VPBJacobi(pc); 157f1be3500SJunchao Zhang PetscFunctionReturn(0); 158f1be3500SJunchao Zhang } 159f1be3500SJunchao Zhang 160d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc) 161d71ae5a4SJacob Faibussowitsch { 162f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data; 163f1be3500SJunchao Zhang PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr); 164f1be3500SJunchao Zhang PetscInt i, n, nblocks, nsize = 0; 165f1be3500SJunchao Zhang const PetscInt *bsizes; 166f1be3500SJunchao Zhang 167f1be3500SJunchao Zhang PetscFunctionBegin; 168f1be3500SJunchao Zhang PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */ 169f1be3500SJunchao Zhang PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes)); 170f1be3500SJunchao Zhang for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i]; 171f1be3500SJunchao Zhang PetscCall(MatGetLocalSize(pc->pmat, &n, NULL)); 172f1be3500SJunchao Zhang 173f1be3500SJunchao Zhang /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */ 174f1be3500SJunchao Zhang if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) { 175f1be3500SJunchao Zhang PetscCallCXX(delete pcuda); 176f1be3500SJunchao Zhang pcuda = nullptr; 177f1be3500SJunchao Zhang } 178f1be3500SJunchao Zhang 179f1be3500SJunchao Zhang if (!pcuda) { /* allocate the struct along with the helper arrays from the scatch */ 180f1be3500SJunchao Zhang PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n, nblocks, nsize, bsizes, jac->diag)); 181f1be3500SJunchao Zhang } else { /* update the value only */ 182f1be3500SJunchao Zhang PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes, jac->diag)); 183f1be3500SJunchao Zhang } 184f1be3500SJunchao Zhang 185f1be3500SJunchao Zhang pc->ops->apply = PCApply_VPBJacobi_CUDA; 186*69eda9daSJed Brown pc->ops->applytranspose = PCApplyTranspose_VPBJacobi_CUDA; 187f1be3500SJunchao Zhang pc->ops->destroy = PCDestroy_VPBJacobi_CUDA; 188f1be3500SJunchao Zhang PetscFunctionReturn(0); 189f1be3500SJunchao Zhang } 190