xref: /petsc/src/ksp/pc/impls/vpbjacobi/cuda/vpbjacobi_cuda.cu (revision 9a56b47496cad4948bae971196c35464b04e1dcc)
1f1be3500SJunchao Zhang #include <petscdevice.h>
2f1be3500SJunchao Zhang #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>
3f1be3500SJunchao Zhang 
4f1be3500SJunchao Zhang /* A class that manages helper arrays assisting parallel PCApply() with CUDA */
5f1be3500SJunchao Zhang struct PC_VPBJacobi_CUDA {
6f1be3500SJunchao Zhang   /* Cache the old sizes to check if we need realloc */
7f1be3500SJunchao Zhang   PetscInt  n; /* number of rows of the local matrix */
8f1be3500SJunchao Zhang   PetscInt  nblocks; /* number of point blocks */
9f1be3500SJunchao Zhang   PetscInt  nsize; /* sum of sizes of the point blocks */
10f1be3500SJunchao Zhang 
11f1be3500SJunchao Zhang   /* Helper arrays that are pre-computed on host and then copied to device.
12f1be3500SJunchao Zhang     bs:     [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n.
13*9a56b474SJunchao Zhang     bs2:    [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize.
14f1be3500SJunchao Zhang     matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block
15f1be3500SJunchao Zhang   */
16f1be3500SJunchao Zhang   PetscInt   *bs_h,*bs2_h,*matIdx_h;
17f1be3500SJunchao Zhang   PetscInt   *bs_d,*bs2_d,*matIdx_d;
18f1be3500SJunchao Zhang 
19f1be3500SJunchao Zhang   MatScalar  *diag_d; /* [nsize], store inverse of the point blocks on device */
20f1be3500SJunchao Zhang 
21f1be3500SJunchao Zhang   PC_VPBJacobi_CUDA(PetscInt n,PetscInt nblocks,PetscInt nsize,const PetscInt *bsizes,MatScalar *diag_h)
22f1be3500SJunchao Zhang     : n(n),nblocks(nblocks),nsize(nsize)
23f1be3500SJunchao Zhang   {
24f1be3500SJunchao Zhang     /* malloc memory on host and device, and then update */
25f1be3500SJunchao Zhang     PetscCallVoid(PetscMalloc3(nblocks+1,&bs_h,nblocks+1,&bs2_h,n,&matIdx_h));
26f1be3500SJunchao Zhang     PetscCallCUDAVoid(cudaMalloc(&bs_d,sizeof(PetscInt)*(nblocks+1)));
27f1be3500SJunchao Zhang     PetscCallCUDAVoid(cudaMalloc(&bs2_d,sizeof(PetscInt)*(nblocks+1)));
28f1be3500SJunchao Zhang     PetscCallCUDAVoid(cudaMalloc(&matIdx_d,sizeof(PetscInt)*n));
29f1be3500SJunchao Zhang     PetscCallCUDAVoid(cudaMalloc(&diag_d,sizeof(MatScalar)*nsize));
30f1be3500SJunchao Zhang     PetscCallVoid(UpdateOffsetsOnDevice(bsizes,diag_h));
31f1be3500SJunchao Zhang   }
32f1be3500SJunchao Zhang 
33f1be3500SJunchao Zhang   PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes,MatScalar *diag_h)
34f1be3500SJunchao Zhang   {
35f1be3500SJunchao Zhang     PetscFunctionBegin;
36f1be3500SJunchao Zhang     PetscCall(ComputeOffsetsOnHost(bsizes));
37f1be3500SJunchao Zhang     PetscCallCUDA(cudaMemcpy(bs_d,bs_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice));
38f1be3500SJunchao Zhang     PetscCallCUDA(cudaMemcpy(bs2_d,bs2_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice));
39f1be3500SJunchao Zhang     PetscCallCUDA(cudaMemcpy(matIdx_d,matIdx_h,sizeof(PetscInt)*n,cudaMemcpyHostToDevice));
40f1be3500SJunchao Zhang     PetscCallCUDA(cudaMemcpy(diag_d,diag_h,sizeof(MatScalar)*nsize,cudaMemcpyHostToDevice));
41*9a56b474SJunchao Zhang     PetscCall(PetscLogCpuToGpu(sizeof(PetscInt)*(2*nblocks+2+n) + sizeof(MatScalar)*nsize));
42f1be3500SJunchao Zhang     PetscFunctionReturn(0);
43f1be3500SJunchao Zhang   }
44f1be3500SJunchao Zhang 
45f1be3500SJunchao Zhang   ~PC_VPBJacobi_CUDA()
46f1be3500SJunchao Zhang   {
47f1be3500SJunchao Zhang     PetscCallVoid(PetscFree3(bs_h,bs2_h,matIdx_h));
48f1be3500SJunchao Zhang     PetscCallCUDAVoid(cudaFree(bs_d));
49f1be3500SJunchao Zhang     PetscCallCUDAVoid(cudaFree(bs2_d));
50f1be3500SJunchao Zhang     PetscCallCUDAVoid(cudaFree(matIdx_d));
51f1be3500SJunchao Zhang     PetscCallCUDAVoid(cudaFree(diag_d));
52f1be3500SJunchao Zhang   }
53f1be3500SJunchao Zhang 
54f1be3500SJunchao Zhang private:
55f1be3500SJunchao Zhang   PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
56f1be3500SJunchao Zhang   {
57f1be3500SJunchao Zhang     PetscFunctionBegin;
58f1be3500SJunchao Zhang     bs_h[0] = bs2_h[0] = 0;
59f1be3500SJunchao Zhang     for (PetscInt i=0; i<nblocks; i++) {
60f1be3500SJunchao Zhang       bs_h[i+1]  = bs_h[i]  + bsizes[i];
61f1be3500SJunchao Zhang       bs2_h[i+1] = bs2_h[i] + bsizes[i]*bsizes[i];
62f1be3500SJunchao Zhang       for (PetscInt j=0; j<bsizes[i]; j++) matIdx_h[bs_h[i]+j] = i;
63f1be3500SJunchao Zhang     }
64f1be3500SJunchao Zhang     PetscFunctionReturn(0);
65f1be3500SJunchao Zhang   }
66f1be3500SJunchao Zhang };
67f1be3500SJunchao Zhang 
68f1be3500SJunchao Zhang /* Like cublasDgemvBatched() but with variable-sized blocks
69f1be3500SJunchao Zhang 
70f1be3500SJunchao Zhang   Input Parameters:
71f1be3500SJunchao Zhang + n       - number of rows of the local matrix
72f1be3500SJunchao Zhang . bs      - [nblocks+1], prefix sum of bsizes[]
73f1be3500SJunchao Zhang . bs2     - [nblocks+1], prefix sum of squares of bsizes[]
74f1be3500SJunchao Zhang . matIdx  - [n], store block/matrix index for each row
75f1be3500SJunchao Zhang . A       - blocks of the matrix back to back in column-major order
76f1be3500SJunchao Zhang - x       - the input vector
77f1be3500SJunchao Zhang 
78f1be3500SJunchao Zhang   Output Parameter:
79f1be3500SJunchao Zhang . y - the output vector
80f1be3500SJunchao Zhang */
81f1be3500SJunchao Zhang __global__ static void MatMultBatched(PetscInt n,const PetscInt *bs,const PetscInt *bs2,const PetscInt *matIdx,const MatScalar *A,const PetscScalar *x,PetscScalar *y)
82f1be3500SJunchao Zhang {
83f1be3500SJunchao Zhang   const PetscInt gridSize = gridDim.x * blockDim.x;
84f1be3500SJunchao Zhang   PetscInt       tid = blockIdx.x*blockDim.x + threadIdx.x;
85f1be3500SJunchao Zhang   PetscInt       i,j,k,m;
86f1be3500SJunchao Zhang 
87f1be3500SJunchao Zhang   /* One row per thread. The blocks/matrices are stored in column-major order */
88f1be3500SJunchao Zhang   for (; tid<n; tid += gridSize) {
89f1be3500SJunchao Zhang     k  = matIdx[tid];     /* k-th block */
90f1be3500SJunchao Zhang     m  = bs[k+1] - bs[k]; /* block size of the k-th block */
91f1be3500SJunchao Zhang     i  = tid - bs[k];     /* i-th row of the block */
92f1be3500SJunchao Zhang     A += bs2[k] + i;      /* advance A to the first entry of i-th row */
93f1be3500SJunchao Zhang     x += bs[k];
94f1be3500SJunchao Zhang     y += bs[k];
95f1be3500SJunchao Zhang 
96f1be3500SJunchao Zhang     y[i] = 0.0;
97f1be3500SJunchao Zhang     for (j=0; j<m; j++) {y[i] += A[0]*x[j]; A += m;}
98f1be3500SJunchao Zhang   }
99f1be3500SJunchao Zhang }
100f1be3500SJunchao Zhang 
101f1be3500SJunchao Zhang static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc,Vec x,Vec y)
102f1be3500SJunchao Zhang {
103f1be3500SJunchao Zhang   PC_VPBJacobi      *jac = (PC_VPBJacobi*)pc->data;
104f1be3500SJunchao Zhang   PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr);
105f1be3500SJunchao Zhang   const PetscScalar *xx;
106f1be3500SJunchao Zhang   PetscScalar       *yy;
107f1be3500SJunchao Zhang   PetscInt          n;
108f1be3500SJunchao Zhang 
109f1be3500SJunchao Zhang   PetscFunctionBegin;
110*9a56b474SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
111f1be3500SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
112f1be3500SJunchao Zhang     PetscBool  isCuda;
113f1be3500SJunchao Zhang     PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&isCuda,VECSEQCUDA,VECMPICUDA,""));
114f1be3500SJunchao Zhang     if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&isCuda,VECSEQCUDA,VECMPICUDA,""));
115f1be3500SJunchao Zhang     PetscCheck(isCuda,PETSC_COMM_SELF,PETSC_ERR_SUP,"PC: applying a CUDA pmat to non-cuda vectors");
116f1be3500SJunchao Zhang   }
117f1be3500SJunchao Zhang 
118f1be3500SJunchao Zhang   PetscCall(MatGetLocalSize(pc->pmat,&n,NULL));
119f1be3500SJunchao Zhang   if (n) {
120f1be3500SJunchao Zhang     PetscInt gridSize = PetscMin((n+255)/256,2147483647); /* <= 2^31-1 */
121f1be3500SJunchao Zhang     PetscCall(VecCUDAGetArrayRead(x,&xx));
122f1be3500SJunchao Zhang     PetscCall(VecCUDAGetArrayWrite(y,&yy));
123f1be3500SJunchao Zhang     MatMultBatched<<<gridSize,256>>>(n,pcuda->bs_d,pcuda->bs2_d,pcuda->matIdx_d,pcuda->diag_d,xx,yy);
124f1be3500SJunchao Zhang     PetscCallCUDA(cudaGetLastError());
125f1be3500SJunchao Zhang     PetscCall(VecCUDARestoreArrayRead(x,&xx));
126f1be3500SJunchao Zhang     PetscCall(VecCUDARestoreArrayWrite(y,&yy));
127f1be3500SJunchao Zhang   }
128*9a56b474SJunchao Zhang   PetscCall(PetscLogGpuFlops(pcuda->nsize*2)); /* FMA on entries in all blocks */
129*9a56b474SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
130f1be3500SJunchao Zhang   PetscFunctionReturn(0);
131f1be3500SJunchao Zhang }
132f1be3500SJunchao Zhang 
133f1be3500SJunchao Zhang static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc)
134f1be3500SJunchao Zhang {
135f1be3500SJunchao Zhang   PC_VPBJacobi       *jac = (PC_VPBJacobi*)pc->data;
136f1be3500SJunchao Zhang 
137f1be3500SJunchao Zhang   PetscFunctionBegin;
138f1be3500SJunchao Zhang   PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA*>(jac->spptr));
139f1be3500SJunchao Zhang   PCDestroy_VPBJacobi(pc);
140f1be3500SJunchao Zhang   PetscFunctionReturn(0);
141f1be3500SJunchao Zhang }
142f1be3500SJunchao Zhang 
143f1be3500SJunchao Zhang PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc)
144f1be3500SJunchao Zhang {
145f1be3500SJunchao Zhang   PC_VPBJacobi      *jac = (PC_VPBJacobi*)pc->data;
146f1be3500SJunchao Zhang   PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr);
147f1be3500SJunchao Zhang   PetscInt          i,n,nblocks,nsize = 0;
148f1be3500SJunchao Zhang   const PetscInt    *bsizes;
149f1be3500SJunchao Zhang 
150f1be3500SJunchao Zhang   PetscFunctionBegin;
151f1be3500SJunchao Zhang   PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */
152f1be3500SJunchao Zhang   PetscCall(MatGetVariableBlockSizes(pc->pmat,&nblocks,&bsizes));
153f1be3500SJunchao Zhang   for (i=0; i<nblocks; i++) nsize += bsizes[i]*bsizes[i];
154f1be3500SJunchao Zhang   PetscCall(MatGetLocalSize(pc->pmat,&n,NULL));
155f1be3500SJunchao Zhang 
156f1be3500SJunchao Zhang   /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
157f1be3500SJunchao Zhang   if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) {
158f1be3500SJunchao Zhang     PetscCallCXX(delete pcuda);
159f1be3500SJunchao Zhang     pcuda = nullptr;
160f1be3500SJunchao Zhang   }
161f1be3500SJunchao Zhang 
162f1be3500SJunchao Zhang   if (!pcuda) { /* allocate the struct along with the helper arrays from the scatch */
163f1be3500SJunchao Zhang     PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n,nblocks,nsize,bsizes,jac->diag));
164f1be3500SJunchao Zhang   } else { /* update the value only */
165f1be3500SJunchao Zhang     PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes,jac->diag));
166f1be3500SJunchao Zhang   }
167f1be3500SJunchao Zhang 
168f1be3500SJunchao Zhang   pc->ops->apply   = PCApply_VPBJacobi_CUDA;
169f1be3500SJunchao Zhang   pc->ops->destroy = PCDestroy_VPBJacobi_CUDA;
170f1be3500SJunchao Zhang   PetscFunctionReturn(0);
171f1be3500SJunchao Zhang }
172