1 #include <petscdevice.h> 2 #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h> 3 4 /* A class that manages helper arrays assisting parallel PCApply() with CUDA */ 5 struct PC_VPBJacobi_CUDA { 6 /* Cache the old sizes to check if we need realloc */ 7 PetscInt n; /* number of rows of the local matrix */ 8 PetscInt nblocks; /* number of point blocks */ 9 PetscInt nsize; /* sum of sizes of the point blocks */ 10 11 /* Helper arrays that are pre-computed on host and then copied to device. 12 bs: [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n. 13 bs2: [nblocks+1], "csr" version of squares of bsizes[] 14 matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block 15 */ 16 PetscInt *bs_h,*bs2_h,*matIdx_h; 17 PetscInt *bs_d,*bs2_d,*matIdx_d; 18 19 MatScalar *diag_d; /* [nsize], store inverse of the point blocks on device */ 20 21 PC_VPBJacobi_CUDA(PetscInt n,PetscInt nblocks,PetscInt nsize,const PetscInt *bsizes,MatScalar *diag_h) 22 : n(n),nblocks(nblocks),nsize(nsize) 23 { 24 /* malloc memory on host and device, and then update */ 25 PetscCallVoid(PetscMalloc3(nblocks+1,&bs_h,nblocks+1,&bs2_h,n,&matIdx_h)); 26 PetscCallCUDAVoid(cudaMalloc(&bs_d,sizeof(PetscInt)*(nblocks+1))); 27 PetscCallCUDAVoid(cudaMalloc(&bs2_d,sizeof(PetscInt)*(nblocks+1))); 28 PetscCallCUDAVoid(cudaMalloc(&matIdx_d,sizeof(PetscInt)*n)); 29 PetscCallCUDAVoid(cudaMalloc(&diag_d,sizeof(MatScalar)*nsize)); 30 PetscCallVoid(UpdateOffsetsOnDevice(bsizes,diag_h)); 31 } 32 33 PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes,MatScalar *diag_h) 34 { 35 PetscFunctionBegin; 36 PetscCall(ComputeOffsetsOnHost(bsizes)); 37 PetscCallCUDA(cudaMemcpy(bs_d,bs_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice)); 38 PetscCallCUDA(cudaMemcpy(bs2_d,bs2_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice)); 39 PetscCallCUDA(cudaMemcpy(matIdx_d,matIdx_h,sizeof(PetscInt)*n,cudaMemcpyHostToDevice)); 40 PetscCallCUDA(cudaMemcpy(diag_d,diag_h,sizeof(MatScalar)*nsize,cudaMemcpyHostToDevice)); 41 PetscFunctionReturn(0); 42 } 43 44 ~PC_VPBJacobi_CUDA() 45 { 46 PetscCallVoid(PetscFree3(bs_h,bs2_h,matIdx_h)); 47 PetscCallCUDAVoid(cudaFree(bs_d)); 48 PetscCallCUDAVoid(cudaFree(bs2_d)); 49 PetscCallCUDAVoid(cudaFree(matIdx_d)); 50 PetscCallCUDAVoid(cudaFree(diag_d)); 51 } 52 53 private: 54 PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes) 55 { 56 PetscFunctionBegin; 57 bs_h[0] = bs2_h[0] = 0; 58 for (PetscInt i=0; i<nblocks; i++) { 59 bs_h[i+1] = bs_h[i] + bsizes[i]; 60 bs2_h[i+1] = bs2_h[i] + bsizes[i]*bsizes[i]; 61 for (PetscInt j=0; j<bsizes[i]; j++) matIdx_h[bs_h[i]+j] = i; 62 } 63 PetscFunctionReturn(0); 64 } 65 }; 66 67 /* Like cublasDgemvBatched() but with variable-sized blocks 68 69 Input Parameters: 70 + n - number of rows of the local matrix 71 . bs - [nblocks+1], prefix sum of bsizes[] 72 . bs2 - [nblocks+1], prefix sum of squares of bsizes[] 73 . matIdx - [n], store block/matrix index for each row 74 . A - blocks of the matrix back to back in column-major order 75 - x - the input vector 76 77 Output Parameter: 78 . y - the output vector 79 */ 80 __global__ static void MatMultBatched(PetscInt n,const PetscInt *bs,const PetscInt *bs2,const PetscInt *matIdx,const MatScalar *A,const PetscScalar *x,PetscScalar *y) 81 { 82 const PetscInt gridSize = gridDim.x * blockDim.x; 83 PetscInt tid = blockIdx.x*blockDim.x + threadIdx.x; 84 PetscInt i,j,k,m; 85 86 /* One row per thread. The blocks/matrices are stored in column-major order */ 87 for (; tid<n; tid += gridSize) { 88 k = matIdx[tid]; /* k-th block */ 89 m = bs[k+1] - bs[k]; /* block size of the k-th block */ 90 i = tid - bs[k]; /* i-th row of the block */ 91 A += bs2[k] + i; /* advance A to the first entry of i-th row */ 92 x += bs[k]; 93 y += bs[k]; 94 95 y[i] = 0.0; 96 for (j=0; j<m; j++) {y[i] += A[0]*x[j]; A += m;} 97 } 98 } 99 100 static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc,Vec x,Vec y) 101 { 102 PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 103 PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr); 104 const PetscScalar *xx; 105 PetscScalar *yy; 106 PetscInt n; 107 108 PetscFunctionBegin; 109 if (PetscDefined(USE_DEBUG)) { 110 PetscBool isCuda; 111 PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&isCuda,VECSEQCUDA,VECMPICUDA,"")); 112 if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&isCuda,VECSEQCUDA,VECMPICUDA,"")); 113 PetscCheck(isCuda,PETSC_COMM_SELF,PETSC_ERR_SUP,"PC: applying a CUDA pmat to non-cuda vectors"); 114 } 115 116 PetscCall(MatGetLocalSize(pc->pmat,&n,NULL)); 117 if (n) { 118 PetscInt gridSize = PetscMin((n+255)/256,2147483647); /* <= 2^31-1 */ 119 PetscCall(VecCUDAGetArrayRead(x,&xx)); 120 PetscCall(VecCUDAGetArrayWrite(y,&yy)); 121 MatMultBatched<<<gridSize,256>>>(n,pcuda->bs_d,pcuda->bs2_d,pcuda->matIdx_d,pcuda->diag_d,xx,yy); 122 PetscCallCUDA(cudaGetLastError()); 123 PetscCall(VecCUDARestoreArrayRead(x,&xx)); 124 PetscCall(VecCUDARestoreArrayWrite(y,&yy)); 125 } 126 PetscFunctionReturn(0); 127 } 128 129 static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc) 130 { 131 PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 132 133 PetscFunctionBegin; 134 PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA*>(jac->spptr)); 135 PCDestroy_VPBJacobi(pc); 136 PetscFunctionReturn(0); 137 } 138 139 PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc) 140 { 141 PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 142 PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr); 143 PetscInt i,n,nblocks,nsize = 0; 144 const PetscInt *bsizes; 145 146 PetscFunctionBegin; 147 PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */ 148 PetscCall(MatGetVariableBlockSizes(pc->pmat,&nblocks,&bsizes)); 149 for (i=0; i<nblocks; i++) nsize += bsizes[i]*bsizes[i]; 150 PetscCall(MatGetLocalSize(pc->pmat,&n,NULL)); 151 152 /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */ 153 if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) { 154 PetscCallCXX(delete pcuda); 155 pcuda = nullptr; 156 } 157 158 if (!pcuda) { /* allocate the struct along with the helper arrays from the scatch */ 159 PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n,nblocks,nsize,bsizes,jac->diag)); 160 } else { /* update the value only */ 161 PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes,jac->diag)); 162 } 163 164 pc->ops->apply = PCApply_VPBJacobi_CUDA; 165 pc->ops->destroy = PCDestroy_VPBJacobi_CUDA; 166 PetscFunctionReturn(0); 167 } 168