1*f1be3500SJunchao Zhang #include <petscdevice.h> 2*f1be3500SJunchao Zhang #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h> 3*f1be3500SJunchao Zhang 4*f1be3500SJunchao Zhang /* A class that manages helper arrays assisting parallel PCApply() with CUDA */ 5*f1be3500SJunchao Zhang struct PC_VPBJacobi_CUDA { 6*f1be3500SJunchao Zhang /* Cache the old sizes to check if we need realloc */ 7*f1be3500SJunchao Zhang PetscInt n; /* number of rows of the local matrix */ 8*f1be3500SJunchao Zhang PetscInt nblocks; /* number of point blocks */ 9*f1be3500SJunchao Zhang PetscInt nsize; /* sum of sizes of the point blocks */ 10*f1be3500SJunchao Zhang 11*f1be3500SJunchao Zhang /* Helper arrays that are pre-computed on host and then copied to device. 12*f1be3500SJunchao Zhang bs: [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n. 13*f1be3500SJunchao Zhang bs2: [nblocks+1], "csr" version of squares of bsizes[] 14*f1be3500SJunchao Zhang matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block 15*f1be3500SJunchao Zhang */ 16*f1be3500SJunchao Zhang PetscInt *bs_h,*bs2_h,*matIdx_h; 17*f1be3500SJunchao Zhang PetscInt *bs_d,*bs2_d,*matIdx_d; 18*f1be3500SJunchao Zhang 19*f1be3500SJunchao Zhang MatScalar *diag_d; /* [nsize], store inverse of the point blocks on device */ 20*f1be3500SJunchao Zhang 21*f1be3500SJunchao Zhang PC_VPBJacobi_CUDA(PetscInt n,PetscInt nblocks,PetscInt nsize,const PetscInt *bsizes,MatScalar *diag_h) 22*f1be3500SJunchao Zhang : n(n),nblocks(nblocks),nsize(nsize) 23*f1be3500SJunchao Zhang { 24*f1be3500SJunchao Zhang /* malloc memory on host and device, and then update */ 25*f1be3500SJunchao Zhang PetscCallVoid(PetscMalloc3(nblocks+1,&bs_h,nblocks+1,&bs2_h,n,&matIdx_h)); 26*f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&bs_d,sizeof(PetscInt)*(nblocks+1))); 27*f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&bs2_d,sizeof(PetscInt)*(nblocks+1))); 28*f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&matIdx_d,sizeof(PetscInt)*n)); 29*f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&diag_d,sizeof(MatScalar)*nsize)); 30*f1be3500SJunchao Zhang PetscCallVoid(UpdateOffsetsOnDevice(bsizes,diag_h)); 31*f1be3500SJunchao Zhang } 32*f1be3500SJunchao Zhang 33*f1be3500SJunchao Zhang PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes,MatScalar *diag_h) 34*f1be3500SJunchao Zhang { 35*f1be3500SJunchao Zhang PetscFunctionBegin; 36*f1be3500SJunchao Zhang PetscCall(ComputeOffsetsOnHost(bsizes)); 37*f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(bs_d,bs_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice)); 38*f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(bs2_d,bs2_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice)); 39*f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(matIdx_d,matIdx_h,sizeof(PetscInt)*n,cudaMemcpyHostToDevice)); 40*f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(diag_d,diag_h,sizeof(MatScalar)*nsize,cudaMemcpyHostToDevice)); 41*f1be3500SJunchao Zhang PetscFunctionReturn(0); 42*f1be3500SJunchao Zhang } 43*f1be3500SJunchao Zhang 44*f1be3500SJunchao Zhang ~PC_VPBJacobi_CUDA() 45*f1be3500SJunchao Zhang { 46*f1be3500SJunchao Zhang PetscCallVoid(PetscFree3(bs_h,bs2_h,matIdx_h)); 47*f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(bs_d)); 48*f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(bs2_d)); 49*f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(matIdx_d)); 50*f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(diag_d)); 51*f1be3500SJunchao Zhang } 52*f1be3500SJunchao Zhang 53*f1be3500SJunchao Zhang private: 54*f1be3500SJunchao Zhang PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes) 55*f1be3500SJunchao Zhang { 56*f1be3500SJunchao Zhang PetscFunctionBegin; 57*f1be3500SJunchao Zhang bs_h[0] = bs2_h[0] = 0; 58*f1be3500SJunchao Zhang for (PetscInt i=0; i<nblocks; i++) { 59*f1be3500SJunchao Zhang bs_h[i+1] = bs_h[i] + bsizes[i]; 60*f1be3500SJunchao Zhang bs2_h[i+1] = bs2_h[i] + bsizes[i]*bsizes[i]; 61*f1be3500SJunchao Zhang for (PetscInt j=0; j<bsizes[i]; j++) matIdx_h[bs_h[i]+j] = i; 62*f1be3500SJunchao Zhang } 63*f1be3500SJunchao Zhang PetscFunctionReturn(0); 64*f1be3500SJunchao Zhang } 65*f1be3500SJunchao Zhang }; 66*f1be3500SJunchao Zhang 67*f1be3500SJunchao Zhang /* Like cublasDgemvBatched() but with variable-sized blocks 68*f1be3500SJunchao Zhang 69*f1be3500SJunchao Zhang Input Parameters: 70*f1be3500SJunchao Zhang + n - number of rows of the local matrix 71*f1be3500SJunchao Zhang . bs - [nblocks+1], prefix sum of bsizes[] 72*f1be3500SJunchao Zhang . bs2 - [nblocks+1], prefix sum of squares of bsizes[] 73*f1be3500SJunchao Zhang . matIdx - [n], store block/matrix index for each row 74*f1be3500SJunchao Zhang . A - blocks of the matrix back to back in column-major order 75*f1be3500SJunchao Zhang - x - the input vector 76*f1be3500SJunchao Zhang 77*f1be3500SJunchao Zhang Output Parameter: 78*f1be3500SJunchao Zhang . y - the output vector 79*f1be3500SJunchao Zhang */ 80*f1be3500SJunchao Zhang __global__ static void MatMultBatched(PetscInt n,const PetscInt *bs,const PetscInt *bs2,const PetscInt *matIdx,const MatScalar *A,const PetscScalar *x,PetscScalar *y) 81*f1be3500SJunchao Zhang { 82*f1be3500SJunchao Zhang const PetscInt gridSize = gridDim.x * blockDim.x; 83*f1be3500SJunchao Zhang PetscInt tid = blockIdx.x*blockDim.x + threadIdx.x; 84*f1be3500SJunchao Zhang PetscInt i,j,k,m; 85*f1be3500SJunchao Zhang 86*f1be3500SJunchao Zhang /* One row per thread. The blocks/matrices are stored in column-major order */ 87*f1be3500SJunchao Zhang for (; tid<n; tid += gridSize) { 88*f1be3500SJunchao Zhang k = matIdx[tid]; /* k-th block */ 89*f1be3500SJunchao Zhang m = bs[k+1] - bs[k]; /* block size of the k-th block */ 90*f1be3500SJunchao Zhang i = tid - bs[k]; /* i-th row of the block */ 91*f1be3500SJunchao Zhang A += bs2[k] + i; /* advance A to the first entry of i-th row */ 92*f1be3500SJunchao Zhang x += bs[k]; 93*f1be3500SJunchao Zhang y += bs[k]; 94*f1be3500SJunchao Zhang 95*f1be3500SJunchao Zhang y[i] = 0.0; 96*f1be3500SJunchao Zhang for (j=0; j<m; j++) {y[i] += A[0]*x[j]; A += m;} 97*f1be3500SJunchao Zhang } 98*f1be3500SJunchao Zhang } 99*f1be3500SJunchao Zhang 100*f1be3500SJunchao Zhang static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc,Vec x,Vec y) 101*f1be3500SJunchao Zhang { 102*f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 103*f1be3500SJunchao Zhang PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr); 104*f1be3500SJunchao Zhang const PetscScalar *xx; 105*f1be3500SJunchao Zhang PetscScalar *yy; 106*f1be3500SJunchao Zhang PetscInt n; 107*f1be3500SJunchao Zhang 108*f1be3500SJunchao Zhang PetscFunctionBegin; 109*f1be3500SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 110*f1be3500SJunchao Zhang PetscBool isCuda; 111*f1be3500SJunchao Zhang PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&isCuda,VECSEQCUDA,VECMPICUDA,"")); 112*f1be3500SJunchao Zhang if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&isCuda,VECSEQCUDA,VECMPICUDA,"")); 113*f1be3500SJunchao Zhang PetscCheck(isCuda,PETSC_COMM_SELF,PETSC_ERR_SUP,"PC: applying a CUDA pmat to non-cuda vectors"); 114*f1be3500SJunchao Zhang } 115*f1be3500SJunchao Zhang 116*f1be3500SJunchao Zhang PetscCall(MatGetLocalSize(pc->pmat,&n,NULL)); 117*f1be3500SJunchao Zhang if (n) { 118*f1be3500SJunchao Zhang PetscInt gridSize = PetscMin((n+255)/256,2147483647); /* <= 2^31-1 */ 119*f1be3500SJunchao Zhang PetscCall(VecCUDAGetArrayRead(x,&xx)); 120*f1be3500SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(y,&yy)); 121*f1be3500SJunchao Zhang MatMultBatched<<<gridSize,256>>>(n,pcuda->bs_d,pcuda->bs2_d,pcuda->matIdx_d,pcuda->diag_d,xx,yy); 122*f1be3500SJunchao Zhang PetscCallCUDA(cudaGetLastError()); 123*f1be3500SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(x,&xx)); 124*f1be3500SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(y,&yy)); 125*f1be3500SJunchao Zhang } 126*f1be3500SJunchao Zhang PetscFunctionReturn(0); 127*f1be3500SJunchao Zhang } 128*f1be3500SJunchao Zhang 129*f1be3500SJunchao Zhang static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc) 130*f1be3500SJunchao Zhang { 131*f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 132*f1be3500SJunchao Zhang 133*f1be3500SJunchao Zhang PetscFunctionBegin; 134*f1be3500SJunchao Zhang PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA*>(jac->spptr)); 135*f1be3500SJunchao Zhang PCDestroy_VPBJacobi(pc); 136*f1be3500SJunchao Zhang PetscFunctionReturn(0); 137*f1be3500SJunchao Zhang } 138*f1be3500SJunchao Zhang 139*f1be3500SJunchao Zhang PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc) 140*f1be3500SJunchao Zhang { 141*f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 142*f1be3500SJunchao Zhang PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr); 143*f1be3500SJunchao Zhang PetscInt i,n,nblocks,nsize = 0; 144*f1be3500SJunchao Zhang const PetscInt *bsizes; 145*f1be3500SJunchao Zhang 146*f1be3500SJunchao Zhang PetscFunctionBegin; 147*f1be3500SJunchao Zhang PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */ 148*f1be3500SJunchao Zhang PetscCall(MatGetVariableBlockSizes(pc->pmat,&nblocks,&bsizes)); 149*f1be3500SJunchao Zhang for (i=0; i<nblocks; i++) nsize += bsizes[i]*bsizes[i]; 150*f1be3500SJunchao Zhang PetscCall(MatGetLocalSize(pc->pmat,&n,NULL)); 151*f1be3500SJunchao Zhang 152*f1be3500SJunchao Zhang /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */ 153*f1be3500SJunchao Zhang if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) { 154*f1be3500SJunchao Zhang PetscCallCXX(delete pcuda); 155*f1be3500SJunchao Zhang pcuda = nullptr; 156*f1be3500SJunchao Zhang } 157*f1be3500SJunchao Zhang 158*f1be3500SJunchao Zhang if (!pcuda) { /* allocate the struct along with the helper arrays from the scatch */ 159*f1be3500SJunchao Zhang PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n,nblocks,nsize,bsizes,jac->diag)); 160*f1be3500SJunchao Zhang } else { /* update the value only */ 161*f1be3500SJunchao Zhang PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes,jac->diag)); 162*f1be3500SJunchao Zhang } 163*f1be3500SJunchao Zhang 164*f1be3500SJunchao Zhang pc->ops->apply = PCApply_VPBJacobi_CUDA; 165*f1be3500SJunchao Zhang pc->ops->destroy = PCDestroy_VPBJacobi_CUDA; 166*f1be3500SJunchao Zhang PetscFunctionReturn(0); 167*f1be3500SJunchao Zhang } 168