1 #include <petscdevice.h> 2 #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h> 3 4 /* A class that manages helper arrays assisting parallel PCApply() with CUDA */ 5 struct PC_VPBJacobi_CUDA { 6 /* Cache the old sizes to check if we need realloc */ 7 PetscInt n; /* number of rows of the local matrix */ 8 PetscInt nblocks; /* number of point blocks */ 9 PetscInt nsize; /* sum of sizes of the point blocks */ 10 11 /* Helper arrays that are pre-computed on host and then copied to device. 12 bs: [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n. 13 bs2: [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize. 14 matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block 15 */ 16 PetscInt *bs_h,*bs2_h,*matIdx_h; 17 PetscInt *bs_d,*bs2_d,*matIdx_d; 18 19 MatScalar *diag_d; /* [nsize], store inverse of the point blocks on device */ 20 21 PC_VPBJacobi_CUDA(PetscInt n,PetscInt nblocks,PetscInt nsize,const PetscInt *bsizes,MatScalar *diag_h) 22 : n(n),nblocks(nblocks),nsize(nsize) 23 { 24 /* malloc memory on host and device, and then update */ 25 PetscCallVoid(PetscMalloc3(nblocks+1,&bs_h,nblocks+1,&bs2_h,n,&matIdx_h)); 26 PetscCallCUDAVoid(cudaMalloc(&bs_d,sizeof(PetscInt)*(nblocks+1))); 27 PetscCallCUDAVoid(cudaMalloc(&bs2_d,sizeof(PetscInt)*(nblocks+1))); 28 PetscCallCUDAVoid(cudaMalloc(&matIdx_d,sizeof(PetscInt)*n)); 29 PetscCallCUDAVoid(cudaMalloc(&diag_d,sizeof(MatScalar)*nsize)); 30 PetscCallVoid(UpdateOffsetsOnDevice(bsizes,diag_h)); 31 } 32 33 PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes,MatScalar *diag_h) 34 { 35 PetscFunctionBegin; 36 PetscCall(ComputeOffsetsOnHost(bsizes)); 37 PetscCallCUDA(cudaMemcpy(bs_d,bs_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice)); 38 PetscCallCUDA(cudaMemcpy(bs2_d,bs2_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice)); 39 PetscCallCUDA(cudaMemcpy(matIdx_d,matIdx_h,sizeof(PetscInt)*n,cudaMemcpyHostToDevice)); 40 PetscCallCUDA(cudaMemcpy(diag_d,diag_h,sizeof(MatScalar)*nsize,cudaMemcpyHostToDevice)); 41 PetscCall(PetscLogCpuToGpu(sizeof(PetscInt)*(2*nblocks+2+n) + sizeof(MatScalar)*nsize)); 42 PetscFunctionReturn(0); 43 } 44 45 ~PC_VPBJacobi_CUDA() 46 { 47 PetscCallVoid(PetscFree3(bs_h,bs2_h,matIdx_h)); 48 PetscCallCUDAVoid(cudaFree(bs_d)); 49 PetscCallCUDAVoid(cudaFree(bs2_d)); 50 PetscCallCUDAVoid(cudaFree(matIdx_d)); 51 PetscCallCUDAVoid(cudaFree(diag_d)); 52 } 53 54 private: 55 PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes) 56 { 57 PetscFunctionBegin; 58 bs_h[0] = bs2_h[0] = 0; 59 for (PetscInt i=0; i<nblocks; i++) { 60 bs_h[i+1] = bs_h[i] + bsizes[i]; 61 bs2_h[i+1] = bs2_h[i] + bsizes[i]*bsizes[i]; 62 for (PetscInt j=0; j<bsizes[i]; j++) matIdx_h[bs_h[i]+j] = i; 63 } 64 PetscFunctionReturn(0); 65 } 66 }; 67 68 /* Like cublasDgemvBatched() but with variable-sized blocks 69 70 Input Parameters: 71 + n - number of rows of the local matrix 72 . bs - [nblocks+1], prefix sum of bsizes[] 73 . bs2 - [nblocks+1], prefix sum of squares of bsizes[] 74 . matIdx - [n], store block/matrix index for each row 75 . A - blocks of the matrix back to back in column-major order 76 - x - the input vector 77 78 Output Parameter: 79 . y - the output vector 80 */ 81 __global__ static void MatMultBatched(PetscInt n,const PetscInt *bs,const PetscInt *bs2,const PetscInt *matIdx,const MatScalar *A,const PetscScalar *x,PetscScalar *y) 82 { 83 const PetscInt gridSize = gridDim.x * blockDim.x; 84 PetscInt tid = blockIdx.x*blockDim.x + threadIdx.x; 85 PetscInt i,j,k,m; 86 87 /* One row per thread. The blocks/matrices are stored in column-major order */ 88 for (; tid<n; tid += gridSize) { 89 k = matIdx[tid]; /* k-th block */ 90 m = bs[k+1] - bs[k]; /* block size of the k-th block */ 91 i = tid - bs[k]; /* i-th row of the block */ 92 A += bs2[k] + i; /* advance A to the first entry of i-th row */ 93 x += bs[k]; 94 y += bs[k]; 95 96 y[i] = 0.0; 97 for (j=0; j<m; j++) {y[i] += A[0]*x[j]; A += m;} 98 } 99 } 100 101 static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc,Vec x,Vec y) 102 { 103 PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 104 PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr); 105 const PetscScalar *xx; 106 PetscScalar *yy; 107 PetscInt n; 108 109 PetscFunctionBegin; 110 PetscCall(PetscLogGpuTimeBegin()); 111 if (PetscDefined(USE_DEBUG)) { 112 PetscBool isCuda; 113 PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&isCuda,VECSEQCUDA,VECMPICUDA,"")); 114 if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&isCuda,VECSEQCUDA,VECMPICUDA,"")); 115 PetscCheck(isCuda,PETSC_COMM_SELF,PETSC_ERR_SUP,"PC: applying a CUDA pmat to non-cuda vectors"); 116 } 117 118 PetscCall(MatGetLocalSize(pc->pmat,&n,NULL)); 119 if (n) { 120 PetscInt gridSize = PetscMin((n+255)/256,2147483647); /* <= 2^31-1 */ 121 PetscCall(VecCUDAGetArrayRead(x,&xx)); 122 PetscCall(VecCUDAGetArrayWrite(y,&yy)); 123 MatMultBatched<<<gridSize,256>>>(n,pcuda->bs_d,pcuda->bs2_d,pcuda->matIdx_d,pcuda->diag_d,xx,yy); 124 PetscCallCUDA(cudaGetLastError()); 125 PetscCall(VecCUDARestoreArrayRead(x,&xx)); 126 PetscCall(VecCUDARestoreArrayWrite(y,&yy)); 127 } 128 PetscCall(PetscLogGpuFlops(pcuda->nsize*2)); /* FMA on entries in all blocks */ 129 PetscCall(PetscLogGpuTimeEnd()); 130 PetscFunctionReturn(0); 131 } 132 133 static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc) 134 { 135 PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 136 137 PetscFunctionBegin; 138 PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA*>(jac->spptr)); 139 PCDestroy_VPBJacobi(pc); 140 PetscFunctionReturn(0); 141 } 142 143 PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc) 144 { 145 PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 146 PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr); 147 PetscInt i,n,nblocks,nsize = 0; 148 const PetscInt *bsizes; 149 150 PetscFunctionBegin; 151 PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */ 152 PetscCall(MatGetVariableBlockSizes(pc->pmat,&nblocks,&bsizes)); 153 for (i=0; i<nblocks; i++) nsize += bsizes[i]*bsizes[i]; 154 PetscCall(MatGetLocalSize(pc->pmat,&n,NULL)); 155 156 /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */ 157 if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) { 158 PetscCallCXX(delete pcuda); 159 pcuda = nullptr; 160 } 161 162 if (!pcuda) { /* allocate the struct along with the helper arrays from the scatch */ 163 PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n,nblocks,nsize,bsizes,jac->diag)); 164 } else { /* update the value only */ 165 PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes,jac->diag)); 166 } 167 168 pc->ops->apply = PCApply_VPBJacobi_CUDA; 169 pc->ops->destroy = PCDestroy_VPBJacobi_CUDA; 170 PetscFunctionReturn(0); 171 } 172