xref: /petsc/src/ksp/pc/impls/vpbjacobi/cuda/vpbjacobi_cuda.cu (revision 4a9974635a2580e0cbff88854d61b4e213d7618d)
10e6b6b59SJacob Faibussowitsch #include <petscdevice_cuda.h>
2f1be3500SJunchao Zhang #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>
3f1be3500SJunchao Zhang 
/* A class that manages helper arrays assisting parallel PCApply() with CUDA */
struct PC_VPBJacobi_CUDA {
  /* Cache the old sizes to check if we need realloc */
  PetscInt n;       /* number of rows of the local matrix */
  PetscInt nblocks; /* number of point blocks */
  PetscInt nsize;   /* sum of sizes of the point blocks */

  /* Helper arrays that are pre-computed on host and then copied to device.
    bs:     [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n.
    bs2:    [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize.
    matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block

    All pointers are zero-initialized so that the destructor is safe to run even
    when the constructor bailed out early through a PetscCall*Void() macro before
    every allocation succeeded (cudaFree(NULL)/PetscFree(NULL) are no-ops). */
  PetscInt *bs_h = nullptr, *bs2_h = nullptr, *matIdx_h = nullptr;
  PetscInt *bs_d = nullptr, *bs2_d = nullptr, *matIdx_d = nullptr;

  MatScalar *diag_d = nullptr; /* [nsize], store inverse of the point blocks on device */

  /* Allocate the host/device helper arrays and fill them from bsizes[]/diag_h[].
     diag_h is the host array of inverted point blocks computed by the host setup. */
  PC_VPBJacobi_CUDA(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_h) : n(n), nblocks(nblocks), nsize(nsize)
  {
    /* malloc memory on host and device, and then update */
    PetscCallVoid(PetscMalloc3(nblocks + 1, &bs_h, nblocks + 1, &bs2_h, n, &matIdx_h));
    PetscCallCUDAVoid(cudaMalloc(&bs_d, sizeof(PetscInt) * (nblocks + 1)));
    PetscCallCUDAVoid(cudaMalloc(&bs2_d, sizeof(PetscInt) * (nblocks + 1)));
    PetscCallCUDAVoid(cudaMalloc(&matIdx_d, sizeof(PetscInt) * n));
    PetscCallCUDAVoid(cudaMalloc(&diag_d, sizeof(MatScalar) * nsize));
    PetscCallVoid(UpdateOffsetsOnDevice(bsizes, diag_h));
  }

  /* Recompute the offset arrays on host and copy them plus diag_h to the device.
     Used both at construction and when only the values (not the sizes) changed. */
  PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_h)
  {
    PetscFunctionBegin;
    PetscCall(ComputeOffsetsOnHost(bsizes));
    PetscCallCUDA(cudaMemcpy(bs_d, bs_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(bs2_d, bs2_h, sizeof(PetscInt) * (nblocks + 1), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matIdx_d, matIdx_h, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(diag_d, diag_h, sizeof(MatScalar) * nsize, cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  ~PC_VPBJacobi_CUDA()
  {
    PetscCallVoid(PetscFree3(bs_h, bs2_h, matIdx_h));
    PetscCallCUDAVoid(cudaFree(bs_d));
    PetscCallCUDAVoid(cudaFree(bs2_d));
    PetscCallCUDAVoid(cudaFree(matIdx_d));
    PetscCallCUDAVoid(cudaFree(diag_d));
  }

private:
  /* Fill bs_h[], bs2_h[] (prefix sums of bsizes[] and of its squares) and
     matIdx_h[] (row -> block index map) on the host */
  PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
  {
    PetscFunctionBegin;
    bs_h[0] = bs2_h[0] = 0;
    for (PetscInt i = 0; i < nblocks; i++) {
      bs_h[i + 1]  = bs_h[i] + bsizes[i];
      bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
      for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};
66f1be3500SJunchao Zhang 
/* Like cublasDgemvBatched() but with variable-sized blocks

  Input Parameters:
+ n       - number of rows of the local matrix
. bs      - [nblocks+1], prefix sum of bsizes[]
. bs2     - [nblocks+1], prefix sum of squares of bsizes[]
. matIdx  - [n], store block/matrix index for each row
. A       - blocks of the matrix back to back in column-major order
. x       - the input vector
- transpose - whether it is MatMult for Ax (false) or MatMultTranspose for A^Tx (true)

  Output Parameter:
. y - the output vector
*/
__global__ static void MatMultBatched(PetscInt n, const PetscInt *bs, const PetscInt *bs2, const PetscInt *matIdx, const MatScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
{
  const PetscInt gridSize = gridDim.x * blockDim.x;
  PetscInt       tid      = blockIdx.x * blockDim.x + threadIdx.x;

  /* One row per thread. The blocks/matrices are stored in column-major order.
     Note we use per-iteration local pointers instead of advancing A/x/y in place:
     in-place advances would accumulate across grid-stride iterations and address
     the wrong block whenever a thread processes more than one row. */
  for (; tid < n; tid += gridSize) {
    const PetscInt     k  = matIdx[tid];                           /* k-th block */
    const PetscInt     m  = bs[k + 1] - bs[k];                     /* block size of the k-th block */
    const PetscInt     i  = tid - bs[k];                           /* i-th row of the block */
    const MatScalar   *Ap = A + bs2[k] + i * (transpose ? m : 1);  /* first entry of i-th row (column when transposed) */
    const PetscScalar *xp = x + bs[k];                             /* start of the block's slice of x */
    PetscScalar        sum = 0.0;

    for (PetscInt j = 0; j < m; j++) {
      sum += Ap[0] * xp[j];
      Ap += (transpose ? 1 : m); /* column-major: stride m walks along a row, stride 1 along a column */
    }
    y[bs[k] + i] = sum; /* single global write per row */
  }
}
103f1be3500SJunchao Zhang 
PCApplyOrTranspose_VPBJacobi_CUDA(PC pc,Vec x,Vec y,PetscBool transpose)10469eda9daSJed Brown static PetscErrorCode PCApplyOrTranspose_VPBJacobi_CUDA(PC pc, Vec x, Vec y, PetscBool transpose)
105d71ae5a4SJacob Faibussowitsch {
106f1be3500SJunchao Zhang   PC_VPBJacobi      *jac   = (PC_VPBJacobi *)pc->data;
107f1be3500SJunchao Zhang   PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
108f1be3500SJunchao Zhang   const PetscScalar *xx;
109f1be3500SJunchao Zhang   PetscScalar       *yy;
110f1be3500SJunchao Zhang   PetscInt           n;
111f1be3500SJunchao Zhang 
112f1be3500SJunchao Zhang   PetscFunctionBegin;
1139a56b474SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
114f1be3500SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
115f1be3500SJunchao Zhang     PetscBool isCuda;
116f1be3500SJunchao Zhang     PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
117f1be3500SJunchao Zhang     if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &isCuda, VECSEQCUDA, VECMPICUDA, ""));
118f1be3500SJunchao Zhang     PetscCheck(isCuda, PETSC_COMM_SELF, PETSC_ERR_SUP, "PC: applying a CUDA pmat to non-cuda vectors");
119f1be3500SJunchao Zhang   }
120f1be3500SJunchao Zhang 
121f1be3500SJunchao Zhang   PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
122f1be3500SJunchao Zhang   if (n) {
123f1be3500SJunchao Zhang     PetscInt gridSize = PetscMin((n + 255) / 256, 2147483647); /* <= 2^31-1 */
124f1be3500SJunchao Zhang     PetscCall(VecCUDAGetArrayRead(x, &xx));
125f1be3500SJunchao Zhang     PetscCall(VecCUDAGetArrayWrite(y, &yy));
12669eda9daSJed Brown     MatMultBatched<<<gridSize, 256>>>(n, pcuda->bs_d, pcuda->bs2_d, pcuda->matIdx_d, pcuda->diag_d, xx, yy, transpose);
127f1be3500SJunchao Zhang     PetscCallCUDA(cudaGetLastError());
128f1be3500SJunchao Zhang     PetscCall(VecCUDARestoreArrayRead(x, &xx));
129f1be3500SJunchao Zhang     PetscCall(VecCUDARestoreArrayWrite(y, &yy));
130f1be3500SJunchao Zhang   }
1319a56b474SJunchao Zhang   PetscCall(PetscLogGpuFlops(pcuda->nsize * 2)); /* FMA on entries in all blocks */
1329a56b474SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
134f1be3500SJunchao Zhang }
135f1be3500SJunchao Zhang 
PCApply_VPBJacobi_CUDA(PC pc,Vec x,Vec y)13669eda9daSJed Brown static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc, Vec x, Vec y)
13769eda9daSJed Brown {
13869eda9daSJed Brown   PetscFunctionBegin;
13969eda9daSJed Brown   PetscCall(PCApplyOrTranspose_VPBJacobi_CUDA(pc, x, y, PETSC_FALSE));
1403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
14169eda9daSJed Brown }
14269eda9daSJed Brown 
PCApplyTranspose_VPBJacobi_CUDA(PC pc,Vec x,Vec y)14369eda9daSJed Brown static PetscErrorCode PCApplyTranspose_VPBJacobi_CUDA(PC pc, Vec x, Vec y)
14469eda9daSJed Brown {
14569eda9daSJed Brown   PetscFunctionBegin;
14669eda9daSJed Brown   PetscCall(PCApplyOrTranspose_VPBJacobi_CUDA(pc, x, y, PETSC_TRUE));
1473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
14869eda9daSJed Brown }
14969eda9daSJed Brown 
PCDestroy_VPBJacobi_CUDA(PC pc)150d71ae5a4SJacob Faibussowitsch static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc)
151d71ae5a4SJacob Faibussowitsch {
152f1be3500SJunchao Zhang   PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
153f1be3500SJunchao Zhang 
154f1be3500SJunchao Zhang   PetscFunctionBegin;
155f1be3500SJunchao Zhang   PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA *>(jac->spptr));
1563ba16761SJacob Faibussowitsch   PetscCall(PCDestroy_VPBJacobi(pc));
1573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
158f1be3500SJunchao Zhang }
159f1be3500SJunchao Zhang 
PCSetUp_VPBJacobi_CUDA(PC pc,Mat diagVPB)160*42ce410bSJunchao Zhang PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc, Mat diagVPB)
161d71ae5a4SJacob Faibussowitsch {
162f1be3500SJunchao Zhang   PC_VPBJacobi      *jac   = (PC_VPBJacobi *)pc->data;
163f1be3500SJunchao Zhang   PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA *>(jac->spptr);
164f1be3500SJunchao Zhang   PetscInt           i, n, nblocks, nsize = 0;
165f1be3500SJunchao Zhang   const PetscInt    *bsizes;
166f1be3500SJunchao Zhang 
167f1be3500SJunchao Zhang   PetscFunctionBegin;
168*42ce410bSJunchao Zhang   PetscCall(PCSetUp_VPBJacobi_Host(pc, diagVPB)); /* Compute the inverse on host now. Might worth doing it on device directly */
169f1be3500SJunchao Zhang   PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes));
170f1be3500SJunchao Zhang   for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
171f1be3500SJunchao Zhang   PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
172f1be3500SJunchao Zhang 
173f1be3500SJunchao Zhang   /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
174f1be3500SJunchao Zhang   if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) {
175f1be3500SJunchao Zhang     PetscCallCXX(delete pcuda);
176f1be3500SJunchao Zhang     pcuda = nullptr;
177f1be3500SJunchao Zhang   }
178f1be3500SJunchao Zhang 
17935cb6cd3SPierre Jolivet   if (!pcuda) { /* allocate the struct along with the helper arrays from the scratch */
180f1be3500SJunchao Zhang     PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n, nblocks, nsize, bsizes, jac->diag));
181f1be3500SJunchao Zhang   } else { /* update the value only */
182f1be3500SJunchao Zhang     PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes, jac->diag));
183f1be3500SJunchao Zhang   }
184f1be3500SJunchao Zhang 
185f1be3500SJunchao Zhang   pc->ops->apply          = PCApply_VPBJacobi_CUDA;
18669eda9daSJed Brown   pc->ops->applytranspose = PCApplyTranspose_VPBJacobi_CUDA;
187f1be3500SJunchao Zhang   pc->ops->destroy        = PCDestroy_VPBJacobi_CUDA;
1883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
189f1be3500SJunchao Zhang }
190