xref: /petsc/src/ksp/pc/impls/vpbjacobi/cuda/vpbjacobi_cuda.cu (revision f1be35001843913b137132cb0fff0c609e27f59e)
1 #include <petscdevice.h>
2 #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>
3 
4 /* A class that manages helper arrays assisting parallel PCApply() with CUDA */
5 struct PC_VPBJacobi_CUDA {
6   /* Cache the old sizes to check if we need realloc */
7   PetscInt  n; /* number of rows of the local matrix */
8   PetscInt  nblocks; /* number of point blocks */
9   PetscInt  nsize; /* sum of sizes of the point blocks */
10 
11   /* Helper arrays that are pre-computed on host and then copied to device.
12     bs:     [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n.
13     bs2:    [nblocks+1], "csr" version of squares of bsizes[]
14     matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block
15   */
16   PetscInt   *bs_h,*bs2_h,*matIdx_h;
17   PetscInt   *bs_d,*bs2_d,*matIdx_d;
18 
19   MatScalar  *diag_d; /* [nsize], store inverse of the point blocks on device */
20 
21   PC_VPBJacobi_CUDA(PetscInt n,PetscInt nblocks,PetscInt nsize,const PetscInt *bsizes,MatScalar *diag_h)
22     : n(n),nblocks(nblocks),nsize(nsize)
23   {
24     /* malloc memory on host and device, and then update */
25     PetscCallVoid(PetscMalloc3(nblocks+1,&bs_h,nblocks+1,&bs2_h,n,&matIdx_h));
26     PetscCallCUDAVoid(cudaMalloc(&bs_d,sizeof(PetscInt)*(nblocks+1)));
27     PetscCallCUDAVoid(cudaMalloc(&bs2_d,sizeof(PetscInt)*(nblocks+1)));
28     PetscCallCUDAVoid(cudaMalloc(&matIdx_d,sizeof(PetscInt)*n));
29     PetscCallCUDAVoid(cudaMalloc(&diag_d,sizeof(MatScalar)*nsize));
30     PetscCallVoid(UpdateOffsetsOnDevice(bsizes,diag_h));
31   }
32 
33   PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes,MatScalar *diag_h)
34   {
35     PetscFunctionBegin;
36     PetscCall(ComputeOffsetsOnHost(bsizes));
37     PetscCallCUDA(cudaMemcpy(bs_d,bs_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice));
38     PetscCallCUDA(cudaMemcpy(bs2_d,bs2_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice));
39     PetscCallCUDA(cudaMemcpy(matIdx_d,matIdx_h,sizeof(PetscInt)*n,cudaMemcpyHostToDevice));
40     PetscCallCUDA(cudaMemcpy(diag_d,diag_h,sizeof(MatScalar)*nsize,cudaMemcpyHostToDevice));
41     PetscFunctionReturn(0);
42   }
43 
44   ~PC_VPBJacobi_CUDA()
45   {
46     PetscCallVoid(PetscFree3(bs_h,bs2_h,matIdx_h));
47     PetscCallCUDAVoid(cudaFree(bs_d));
48     PetscCallCUDAVoid(cudaFree(bs2_d));
49     PetscCallCUDAVoid(cudaFree(matIdx_d));
50     PetscCallCUDAVoid(cudaFree(diag_d));
51   }
52 
53 private:
54   PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
55   {
56     PetscFunctionBegin;
57     bs_h[0] = bs2_h[0] = 0;
58     for (PetscInt i=0; i<nblocks; i++) {
59       bs_h[i+1]  = bs_h[i]  + bsizes[i];
60       bs2_h[i+1] = bs2_h[i] + bsizes[i]*bsizes[i];
61       for (PetscInt j=0; j<bsizes[i]; j++) matIdx_h[bs_h[i]+j] = i;
62     }
63     PetscFunctionReturn(0);
64   }
65 };
66 
67 /* Like cublasDgemvBatched() but with variable-sized blocks
68 
69   Input Parameters:
70 + n       - number of rows of the local matrix
71 . bs      - [nblocks+1], prefix sum of bsizes[]
72 . bs2     - [nblocks+1], prefix sum of squares of bsizes[]
73 . matIdx  - [n], store block/matrix index for each row
74 . A       - blocks of the matrix back to back in column-major order
75 - x       - the input vector
76 
77   Output Parameter:
78 . y - the output vector
79 */
80 __global__ static void MatMultBatched(PetscInt n,const PetscInt *bs,const PetscInt *bs2,const PetscInt *matIdx,const MatScalar *A,const PetscScalar *x,PetscScalar *y)
81 {
82   const PetscInt gridSize = gridDim.x * blockDim.x;
83   PetscInt       tid = blockIdx.x*blockDim.x + threadIdx.x;
84   PetscInt       i,j,k,m;
85 
86   /* One row per thread. The blocks/matrices are stored in column-major order */
87   for (; tid<n; tid += gridSize) {
88     k  = matIdx[tid];     /* k-th block */
89     m  = bs[k+1] - bs[k]; /* block size of the k-th block */
90     i  = tid - bs[k];     /* i-th row of the block */
91     A += bs2[k] + i;      /* advance A to the first entry of i-th row */
92     x += bs[k];
93     y += bs[k];
94 
95     y[i] = 0.0;
96     for (j=0; j<m; j++) {y[i] += A[0]*x[j]; A += m;}
97   }
98 }
99 
100 static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc,Vec x,Vec y)
101 {
102   PC_VPBJacobi      *jac = (PC_VPBJacobi*)pc->data;
103   PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr);
104   const PetscScalar *xx;
105   PetscScalar       *yy;
106   PetscInt          n;
107 
108   PetscFunctionBegin;
109   if (PetscDefined(USE_DEBUG)) {
110     PetscBool  isCuda;
111     PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&isCuda,VECSEQCUDA,VECMPICUDA,""));
112     if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&isCuda,VECSEQCUDA,VECMPICUDA,""));
113     PetscCheck(isCuda,PETSC_COMM_SELF,PETSC_ERR_SUP,"PC: applying a CUDA pmat to non-cuda vectors");
114   }
115 
116   PetscCall(MatGetLocalSize(pc->pmat,&n,NULL));
117   if (n) {
118     PetscInt gridSize = PetscMin((n+255)/256,2147483647); /* <= 2^31-1 */
119     PetscCall(VecCUDAGetArrayRead(x,&xx));
120     PetscCall(VecCUDAGetArrayWrite(y,&yy));
121     MatMultBatched<<<gridSize,256>>>(n,pcuda->bs_d,pcuda->bs2_d,pcuda->matIdx_d,pcuda->diag_d,xx,yy);
122     PetscCallCUDA(cudaGetLastError());
123     PetscCall(VecCUDARestoreArrayRead(x,&xx));
124     PetscCall(VecCUDARestoreArrayWrite(y,&yy));
125   }
126   PetscFunctionReturn(0);
127 }
128 
129 static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc)
130 {
131   PC_VPBJacobi       *jac = (PC_VPBJacobi*)pc->data;
132 
133   PetscFunctionBegin;
134   PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA*>(jac->spptr));
135   PCDestroy_VPBJacobi(pc);
136   PetscFunctionReturn(0);
137 }
138 
139 PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc)
140 {
141   PC_VPBJacobi      *jac = (PC_VPBJacobi*)pc->data;
142   PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr);
143   PetscInt          i,n,nblocks,nsize = 0;
144   const PetscInt    *bsizes;
145 
146   PetscFunctionBegin;
147   PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */
148   PetscCall(MatGetVariableBlockSizes(pc->pmat,&nblocks,&bsizes));
149   for (i=0; i<nblocks; i++) nsize += bsizes[i]*bsizes[i];
150   PetscCall(MatGetLocalSize(pc->pmat,&n,NULL));
151 
152   /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
153   if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) {
154     PetscCallCXX(delete pcuda);
155     pcuda = nullptr;
156   }
157 
158   if (!pcuda) { /* allocate the struct along with the helper arrays from the scatch */
159     PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n,nblocks,nsize,bsizes,jac->diag));
160   } else { /* update the value only */
161     PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes,jac->diag));
162   }
163 
164   pc->ops->apply   = PCApply_VPBJacobi_CUDA;
165   pc->ops->destroy = PCDestroy_VPBJacobi_CUDA;
166   PetscFunctionReturn(0);
167 }
168