xref: /petsc/src/ksp/pc/impls/vpbjacobi/cuda/vpbjacobi_cuda.cu (revision b8d709ab44c354dba7591c0191c0984ef3bb0cd5)
1 #include <petscdevice_cuda.h>
2 #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>
3 
4 /* A class that manages helper arrays assisting parallel PCApply() with CUDA */
5 struct PC_VPBJacobi_CUDA {
6   /* Cache the old sizes to check if we need realloc */
7   PetscInt n;       /* number of rows of the local matrix */
8   PetscInt nblocks; /* number of point blocks */
9   PetscInt nsize;   /* sum of sizes of the point blocks */
10 
11   /* Helper arrays that are pre-computed on host and then copied to device.
12     bs:     [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n.
13     bs2:    [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize.
14     matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block
15   */
16   PetscInt *bs_h, *bs2_h, *matIdx_h;
17   PetscInt *bs_d, *bs2_d, *matIdx_d;
18 
19   MatScalar *diag_d; /* [nsize], store inverse of the point blocks on device */
20 
21   PC_VPBJacobi_CUDA(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_h) : n(n), nblocks(nblocks), nsize(nsize)
22   {
23     /* malloc memory on host and device, and then update */
24     PetscCallVoid(PetscMalloc3(nblocks + 1, &bs_h, nblocks + 1, &bs2_h, n, &matIdx_h));
25     PetscCallCUDAVoid(cudaMalloc(&bs_d, sizeof(PetscInt) * (nblocks + 1)));
26     PetscCallCUDAVoid(cudaMalloc(&bs2_d, sizeof(PetscInt) * (nblocks + 1)));
27     PetscCallCUDAVoid(cudaMalloc(&matIdx_d, sizeof(PetscInt) * n));
28     PetscCallCUDAVoid(cudaMalloc(&diag_d, sizeof(MatScalar) * nsize));
29     PetscCallVoid(UpdateOffsetsOnDevice(bsizes, diag_h));
30   }
31 
32   PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_h)
33   {
34     PetscFunctionBegin;
35     PetscCall(ComputeOffsetsOnHost(bsizes));
36     PetscCallCUDA(cudaMemcpy(bs_d, bs_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
37     PetscCallCUDA(cudaMemcpy(bs2_d, bs2_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
38     PetscCallCUDA(cudaMemcpy(matIdx_d, matIdx_h, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
39     PetscCallCUDA(cudaMemcpy(diag_d, diag_h, sizeof(MatScalar) * nsize, cudaMemcpyHostToDevice));
40     PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize));
41     PetscFunctionReturn(0);
42   }
43 
44   ~PC_VPBJacobi_CUDA()
45   {
46     PetscCallVoid(PetscFree3(bs_h, bs2_h, matIdx_h));
47     PetscCallCUDAVoid(cudaFree(bs_d));
48     PetscCallCUDAVoid(cudaFree(bs2_d));
49     PetscCallCUDAVoid(cudaFree(matIdx_d));
50     PetscCallCUDAVoid(cudaFree(diag_d));
51   }
52 
53 private:
54   PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
55   {
56     PetscFunctionBegin;
57     bs_h[0] = bs2_h[0] = 0;
58     for (PetscInt i = 0; i < nblocks; i++) {
59       bs_h[i + 1]  = bs_h[i] + bsizes[i];
60       bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
61       for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
62     }
63     PetscFunctionReturn(0);
64   }
65 };
66 
67 /* Like cublasDgemvBatched() but with variable-sized blocks
68 
69   Input Parameters:
70 + n       - number of rows of the local matrix
71 . bs      - [nblocks+1], prefix sum of bsizes[]
72 . bs2     - [nblocks+1], prefix sum of squares of bsizes[]
73 . matIdx  - [n], store block/matrix index for each row
74 . A       - blocks of the matrix back to back in column-major order
75 - x       - the input vector
76 
77   Output Parameter:
78 . y - the output vector
79 */
80 __global__ static void MatMultBatched(PetscInt n, const PetscInt *bs, const PetscInt *bs2, const PetscInt *matIdx, const MatScalar *A, const PetscScalar *x, PetscScalar *y)
81 {
82   const PetscInt gridSize = gridDim.x * blockDim.x;
83   PetscInt       tid      = blockIdx.x * blockDim.x + threadIdx.x;
84   PetscInt       i, j, k, m;
85 
86   /* One row per thread. The blocks/matrices are stored in column-major order */
87   for (; tid < n; tid += gridSize) {
88     k = matIdx[tid];       /* k-th block */
89     m = bs[k + 1] - bs[k]; /* block size of the k-th block */
90     i = tid - bs[k];       /* i-th row of the block */
91     A += bs2[k] + i;       /* advance A to the first entry of i-th row */
92     x += bs[k];
93     y += bs[k];
94 
95     y[i] = 0.0;
96     for (j = 0; j < m; j++) {
97       y[i] += A[0] * x[j];
98       A += m;
99     }
100   }
101 }
102 
103 static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc, Vec x, Vec y)
104 {
105   PC_VPBJacobi      *jac   = (PC_VPBJacobi *)pc->data;
106   PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
107   const PetscScalar *xx;
108   PetscScalar       *yy;
109   PetscInt           n;
110 
111   PetscFunctionBegin;
112   PetscCall(PetscLogGpuTimeBegin());
113   if (PetscDefined(USE_DEBUG)) {
114     PetscBool isCuda;
115     PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
116     if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
117     PetscCheck(isCuda, PETSC_COMM_SELF, PETSC_ERR_SUP, "PC: applying a CUDA pmat to non-cuda vectors");
118   }
119 
120   PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
121   if (n) {
122     PetscInt gridSize = PetscMin((n + 255) / 256, 2147483647); /* <= 2^31-1 */
123     PetscCall(VecCUDAGetArrayRead(x, &xx));
124     PetscCall(VecCUDAGetArrayWrite(y, &yy));
125     MatMultBatched<<<gridSize, 256>>>(n, pcuda->bs_d, pcuda->bs2_d, pcuda->matIdx_d, pcuda->diag_d, xx, yy);
126     PetscCallCUDA(cudaGetLastError());
127     PetscCall(VecCUDARestoreArrayRead(x, &xx));
128     PetscCall(VecCUDARestoreArrayWrite(y, &yy));
129   }
130   PetscCall(PetscLogGpuFlops(pcuda->nsize * 2)); /* FMA on entries in all blocks */
131   PetscCall(PetscLogGpuTimeEnd());
132   PetscFunctionReturn(0);
133 }
134 
135 static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc)
136 {
137   PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
138 
139   PetscFunctionBegin;
140   PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA *>(jac->spptr));
141   PCDestroy_VPBJacobi(pc);
142   PetscFunctionReturn(0);
143 }
144 
145 PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc)
146 {
147   PC_VPBJacobi      *jac   = (PC_VPBJacobi *)pc->data;
148   PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
149   PetscInt           i, n, nblocks, nsize = 0;
150   const PetscInt    *bsizes;
151 
152   PetscFunctionBegin;
153   PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */
154   PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes));
155   for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
156   PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
157 
158   /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
159   if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) {
160     PetscCallCXX(delete pcuda);
161     pcuda = nullptr;
162   }
163 
164   if (!pcuda) { /* allocate the struct along with the helper arrays from the scratch */
165     PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n, nblocks, nsize, bsizes, jac->diag));
166   } else { /* update the value only */
167     PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes, jac->diag));
168   }
169 
170   pc->ops->apply   = PCApply_VPBJacobi_CUDA;
171   pc->ops->destroy = PCDestroy_VPBJacobi_CUDA;
172   PetscFunctionReturn(0);
173 }
174