112facf1bSJunchao Zhang #include <petscdevice_cuda.h> 212facf1bSJunchao Zhang #include <petsc/private/petsclegacycupmblas.h> 312facf1bSJunchao Zhang #include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h> 412facf1bSJunchao Zhang 512facf1bSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 7, 0) 612facf1bSJunchao Zhang __global__ static void MatMultBatched(PetscInt bs, PetscInt mbs, const PetscScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose) 712facf1bSJunchao Zhang { 812facf1bSJunchao Zhang const PetscInt gridSize = gridDim.x * blockDim.x; 912facf1bSJunchao Zhang PetscInt row = blockIdx.x * blockDim.x + threadIdx.x; 1012facf1bSJunchao Zhang const PetscInt bs2 = bs * bs; 1112facf1bSJunchao Zhang 1212facf1bSJunchao Zhang /* One row per thread. The blocks are stored in column-major order */ 1312facf1bSJunchao Zhang for (; row < bs * mbs; row += gridSize) { 1412facf1bSJunchao Zhang const PetscScalar *Ap, *xp; 1512facf1bSJunchao Zhang PetscScalar *yp; 1612facf1bSJunchao Zhang PetscInt i, j, k; 1712facf1bSJunchao Zhang 1812facf1bSJunchao Zhang k = row / bs; /* k-th block */ 1912facf1bSJunchao Zhang i = row % bs; /* this thread deals with i-th row of the block */ 2012facf1bSJunchao Zhang Ap = &A[bs2 * k + i * (transpose ? bs : 1)]; /* Ap points to the first entry of i-th row */ 2112facf1bSJunchao Zhang xp = &x[bs * k]; 2212facf1bSJunchao Zhang yp = &y[bs * k]; 2312facf1bSJunchao Zhang /* multiply i-th row (column) with x */ 2412facf1bSJunchao Zhang yp[i] = 0.0; 2512facf1bSJunchao Zhang for (j = 0; j < bs; j++) { 2612facf1bSJunchao Zhang yp[i] += Ap[0] * xp[j]; 2712facf1bSJunchao Zhang Ap += (transpose ? 1 : bs); /* block is in column major order */ 2812facf1bSJunchao Zhang } 2912facf1bSJunchao Zhang } 3012facf1bSJunchao Zhang } 3112facf1bSJunchao Zhang #endif 3212facf1bSJunchao Zhang 3312facf1bSJunchao Zhang static PetscErrorCode PCApplyOrTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y, cublasOperation_t op) 3412facf1bSJunchao Zhang { 3512facf1bSJunchao Zhang const PetscScalar *xx; 3612facf1bSJunchao Zhang PetscScalar *yy; 3712facf1bSJunchao Zhang cublasHandle_t handle; 3812facf1bSJunchao Zhang PC_PBJacobi *jac = (PC_PBJacobi *)pc->data; 3912facf1bSJunchao Zhang const PetscScalar *A = (const PetscScalar *)jac->spptr; 4012facf1bSJunchao Zhang const PetscInt bs = jac->bs, mbs = jac->mbs; 4112facf1bSJunchao Zhang 4212facf1bSJunchao Zhang PetscFunctionBegin; 4312facf1bSJunchao Zhang PetscCall(VecCUDAGetArrayRead(x, &xx)); 4412facf1bSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(y, &yy)); 4512facf1bSJunchao Zhang PetscCall(PetscCUBLASGetHandle(&handle)); 4612facf1bSJunchao Zhang PetscCallCUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); /* alpha, beta are on host */ 4712facf1bSJunchao Zhang 4812facf1bSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 7, 0) 4912facf1bSJunchao Zhang /* y = alpha op(A) x + beta y */ 5012facf1bSJunchao Zhang const PetscScalar alpha = 1.0, beta = 0.0; 5112facf1bSJunchao Zhang PetscCallCUBLAS(cublasXgemvStridedBatched(handle, op, bs, bs, &alpha, A, bs, bs * bs, xx, 1, bs, &beta, yy, 1, bs, mbs)); 5212facf1bSJunchao Zhang #else 5312facf1bSJunchao Zhang PetscInt gridSize = PetscMin((bs * mbs + 255) / 256, 2147483647); /* <= 2^31-1 */ 5412facf1bSJunchao Zhang MatMultBatched<<<gridSize, 256>>>(bs, mbs, A, xx, yy, (op == CUBLAS_OP_T ? PETSC_TRUE : PETSC_FALSE)); 5512facf1bSJunchao Zhang PetscCallCUDA(cudaGetLastError()); 5612facf1bSJunchao Zhang #endif 5712facf1bSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(x, &xx)); 5812facf1bSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(y, &yy)); 5912facf1bSJunchao Zhang PetscCall(PetscLogGpuFlops(bs * bs * mbs * 2)); 60*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6112facf1bSJunchao Zhang } 6212facf1bSJunchao Zhang 6312facf1bSJunchao Zhang static PetscErrorCode PCApply_PBJacobi_CUDA(PC pc, Vec x, Vec y) 6412facf1bSJunchao Zhang { 6512facf1bSJunchao Zhang PetscFunctionBegin; 6612facf1bSJunchao Zhang PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_N)); // No transpose 67*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6812facf1bSJunchao Zhang } 6912facf1bSJunchao Zhang 7012facf1bSJunchao Zhang static PetscErrorCode PCApplyTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y) 7112facf1bSJunchao Zhang { 7212facf1bSJunchao Zhang PetscFunctionBegin; 7312facf1bSJunchao Zhang PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_T)); // Transpose 74*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 7512facf1bSJunchao Zhang } 7612facf1bSJunchao Zhang 7712facf1bSJunchao Zhang static PetscErrorCode PCDestroy_PBJacobi_CUDA(PC pc) 7812facf1bSJunchao Zhang { 7912facf1bSJunchao Zhang PC_PBJacobi *jac = (PC_PBJacobi *)pc->data; 8012facf1bSJunchao Zhang 8112facf1bSJunchao Zhang PetscFunctionBegin; 8212facf1bSJunchao Zhang PetscCallCUDA(cudaFree(jac->spptr)); 8312facf1bSJunchao Zhang PetscCall(PCDestroy_PBJacobi(pc)); 84*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 8512facf1bSJunchao Zhang } 8612facf1bSJunchao Zhang 8712facf1bSJunchao Zhang PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_CUDA(PC pc) 8812facf1bSJunchao Zhang { 8912facf1bSJunchao Zhang PC_PBJacobi *jac = (PC_PBJacobi *)pc->data; 9012facf1bSJunchao Zhang size_t size; 9112facf1bSJunchao Zhang 9212facf1bSJunchao Zhang PetscFunctionBegin; 9312facf1bSJunchao Zhang PetscCall(PCSetUp_PBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */ 9412facf1bSJunchao Zhang size = sizeof(PetscScalar) * jac->bs * jac->bs * jac->mbs; 9512facf1bSJunchao Zhang 9612facf1bSJunchao Zhang /* PBJacobi_CUDA is simple so that we use jac->spptr as if it is diag_d */ 9712facf1bSJunchao Zhang if (!jac->spptr) PetscCallCUDAVoid(cudaMalloc(&jac->spptr, size)); 9812facf1bSJunchao Zhang PetscCallCUDAVoid(cudaMemcpy(jac->spptr, jac->diag, size, cudaMemcpyHostToDevice)); 9912facf1bSJunchao Zhang PetscCall(PetscLogCpuToGpu(size)); 10012facf1bSJunchao Zhang 10112facf1bSJunchao Zhang pc->ops->apply = PCApply_PBJacobi_CUDA; 10212facf1bSJunchao Zhang pc->ops->applytranspose = PCApplyTranspose_PBJacobi_CUDA; 10312facf1bSJunchao Zhang pc->ops->destroy = PCDestroy_PBJacobi_CUDA; 104*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10512facf1bSJunchao Zhang } 106