xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision 2d1451d43b73a0495cd81c074cbc1e0206888947)
1 #include <cuda_runtime.h>
2 
3 #include <petscdevice_cuda.h>
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 
6 typedef struct {
7   PetscInt  *colidx; /* column index */
8   MatScalar *val;
9   PetscInt  *sliidx;
10   PetscInt   nonzerostate;
11 } Mat_SeqSELLCUDA;
12 
13 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
14 {
15   PetscFunctionBegin;
16   if (*cudastruct) {
17     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
18     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
19     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
20     PetscCall(PetscFree(*cudastruct));
21   }
22   PetscFunctionReturn(PETSC_SUCCESS);
23 }
24 
25 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
26 {
27   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
28   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
29 
30   PetscFunctionBegin;
31   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
32     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
33     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
34       /* copy values only */
35       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
36       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
37     } else {
38       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
39       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
40       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
41       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
42       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
43       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
44       /* copy values, nz or maxallocmat? */
45       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
46       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
47       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
48       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
49       cudastruct->nonzerostate = A->nonzerostate;
50     }
51     PetscCallCUDA(WaitForCUDA());
52     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
53     A->offloadmask = PETSC_OFFLOAD_BOTH;
54   }
55   PetscFunctionReturn(PETSC_SUCCESS);
56 }
57 
58 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
59 {
60   PetscInt  i, row, slice_id, row_in_slice;
61   MatScalar sum;
62   /* one thread per row. */
63   row = blockIdx.x * blockDim.x + threadIdx.x;
64   if (row < nrows) {
65     slice_id     = row / SLICE_HEIGHT;
66     row_in_slice = row % SLICE_HEIGHT;
67     if (slice_id < totalslices) {
68       sum = 0.0;
69       for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
70       if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */
71         if (row_in_slice < (nrows % SLICE_HEIGHT)) y[row] = sum;
72       } else {
73         y[row] = sum;
74       }
75     }
76   }
77 }
78 
79 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
80 {
81   PetscInt  i, row, slice_id, row_in_slice;
82   MatScalar sum;
83   /* one thread per row. */
84   row = blockIdx.x * blockDim.x + threadIdx.x;
85   if (row < nrows) {
86     slice_id     = row / SLICE_HEIGHT;
87     row_in_slice = row % SLICE_HEIGHT;
88     if (slice_id < totalslices) {
89       sum = 0.0;
90       for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
91       if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */
92         if (row_in_slice < (nrows % SLICE_HEIGHT)) z[row] = y[row] + sum;
93       } else {
94         z[row] = y[row] + sum;
95       }
96     }
97   }
98 }
99 
100 __global__ void matmult_seqsell_tiled_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
101 {
102   __shared__ MatScalar shared[256];
103   PetscInt             i, row, slice_id, row_in_slice;
104   /* one thread per row. */
105   row = blockIdx.x * blockDim.x + threadIdx.x;
106   if (row < nrows) {
107     slice_id     = row / SLICE_HEIGHT;
108     row_in_slice = row % SLICE_HEIGHT;
109 
110     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
111     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
112     if (blockDim.y > 4) {
113       __syncthreads();
114       if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
115     }
116     if (blockDim.y > 2) {
117       __syncthreads();
118       if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
119     }
120     if (blockDim.y > 1) {
121       __syncthreads();
122       if (threadIdx.y < 1) { shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; }
123     }
124     if (threadIdx.y < 1) {
125       if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */
126         if (row_in_slice < (nrows % SLICE_HEIGHT)) y[row] = shared[threadIdx.x];
127       } else {
128         y[row] = shared[threadIdx.x];
129       }
130     }
131   }
132 }
133 
134 __global__ void matmultadd_seqsell_tiled_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
135 {
136   __shared__ MatScalar shared[256];
137   PetscInt             i, row, slice_id, row_in_slice;
138   /* one thread per row. */
139   row = blockIdx.x * blockDim.x + threadIdx.x;
140   if (row < nrows) {
141     slice_id     = row / SLICE_HEIGHT;
142     row_in_slice = row % SLICE_HEIGHT;
143 
144     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
145     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
146     if (blockDim.y > 4) {
147       __syncthreads();
148       if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
149     }
150     if (blockDim.y > 2) {
151       __syncthreads();
152       if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
153     }
154     if (blockDim.y > 1) {
155       __syncthreads();
156       if (threadIdx.y < 1) { shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; }
157     }
158     if (threadIdx.y < 1) {
159       if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */
160         if (row_in_slice < (nrows % SLICE_HEIGHT)) z[row] = y[row] + shared[threadIdx.x];
161       } else {
162         z[row] = y[row] + shared[threadIdx.x];
163       }
164     }
165   }
166 }
167 
168 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
169 {
170   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
171   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
172   PetscScalar       *y;
173   const PetscScalar *x;
174   PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
175   MatScalar         *aval;
176   PetscInt          *acolidx;
177   PetscInt          *sliidx;
178   PetscInt           nblocks, blocksize = 256;
179 
180   PetscFunctionBegin;
181   PetscCall(MatSeqSELLCUDACopyToGPU(A));
182   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
183   aval    = cudastruct->val;
184   acolidx = cudastruct->colidx;
185   sliidx  = cudastruct->sliidx;
186 
187   PetscCall(VecCUDAGetArrayRead(xx, &x));
188   PetscCall(VecCUDAGetArrayWrite(yy, &y));
189   PetscCall(PetscLogGpuTimeBegin());
190   nblocks = (nrows + blocksize - 1) / blocksize;
191   if (nblocks >= 80) {
192     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
193   } else {
194     PetscInt avg_width;
195     dim3     block1(256, 1), block2(128, 2), block4(64, 4), block8(32, 8);
196     avg_width = a->sliidx[a->totalslices] / (SLICE_HEIGHT * a->totalslices);
197     if (avg_width > 64) {
198       matmult_seqsell_tiled_kernel<<<nblocks * 8, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
199     } else if (avg_width > 32) {
200       matmult_seqsell_tiled_kernel<<<nblocks * 4, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
201     } else if (avg_width > 16) {
202       matmult_seqsell_tiled_kernel<<<nblocks * 2, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
203     } else {
204       matmult_seqsell_tiled_kernel<<<nblocks, block1>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
205     }
206   }
207   PetscCallCUDA(WaitForCUDA());
208   PetscCall(PetscLogGpuTimeEnd());
209   PetscCall(VecCUDARestoreArrayRead(xx, &x));
210   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
211   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
216 {
217   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
218   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
219   PetscScalar       *z;
220   const PetscScalar *y, *x;
221   PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
222   MatScalar         *aval    = cudastruct->val;
223   PetscInt          *acolidx = cudastruct->colidx;
224   PetscInt          *sliidx  = cudastruct->sliidx;
225 
226   PetscFunctionBegin;
227   PetscCall(MatSeqSELLCUDACopyToGPU(A));
228   if (a->nz) {
229     PetscInt nblocks, blocksize = 256;
230     PetscCall(VecCUDAGetArrayRead(xx, &x));
231     PetscCall(VecCUDAGetArrayRead(yy, &y));
232     PetscCall(VecCUDAGetArrayWrite(zz, &z));
233     PetscCall(PetscLogGpuTimeBegin());
234     nblocks = (nrows + blocksize - 1) / blocksize;
235     if (nblocks >= 80) {
236       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
237     } else {
238       PetscInt avg_width;
239       dim3     block1(256, 1), block2(128, 2), block4(64, 4), block8(32, 8);
240       avg_width = a->sliidx[a->totalslices] / (SLICE_HEIGHT * a->totalslices);
241       if (avg_width > 64) {
242         matmultadd_seqsell_tiled_kernel<<<nblocks * 8, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
243       } else if (avg_width > 32) {
244         matmultadd_seqsell_tiled_kernel<<<nblocks * 4, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
245       } else if (avg_width > 16) {
246         matmultadd_seqsell_tiled_kernel<<<nblocks * 2, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
247       } else {
248         matmultadd_seqsell_tiled_kernel<<<nblocks, block1>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
249       }
250     }
251     PetscCallCUDA(WaitForCUDA());
252     PetscCall(PetscLogGpuTimeEnd());
253     PetscCall(VecCUDARestoreArrayRead(xx, &x));
254     PetscCall(VecCUDARestoreArrayRead(yy, &y));
255     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
256     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
257   } else {
258     PetscCall(VecCopy(yy, zz));
259   }
260   PetscFunctionReturn(PETSC_SUCCESS);
261 }
262 
263 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
264 {
265   PetscFunctionBegin;
266   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
267   PetscOptionsHeadEnd();
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
272 {
273   PetscFunctionBegin;
274   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
275   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
276   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
277   A->ops->mult    = MatMult_SeqSELLCUDA;
278   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
279   PetscFunctionReturn(PETSC_SUCCESS);
280 }
281 
282 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
283 {
284   PetscFunctionBegin;
285   if (A->factortype == MAT_FACTOR_NONE) {
286     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
287   }
288   PetscCall(MatDestroy_SeqSELL(A));
289   PetscFunctionReturn(PETSC_SUCCESS);
290 }
291 
292 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
293 static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
294 {
295   PetscFunctionBegin;
296   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
297   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
298   PetscFunctionReturn(PETSC_SUCCESS);
299 }
300 
301 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
302 {
303   Mat_SeqSELLCUDA *cudastruct;
304 
305   PetscFunctionBegin;
306   PetscCall(PetscFree(B->defaultvectype));
307   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
308 
309   if (!B->spptr) {
310     if (B->factortype == MAT_FACTOR_NONE) {
311       PetscCall(PetscNew(&cudastruct));
312       B->spptr = cudastruct;
313     }
314   }
315 
316   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
317   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
318   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
319   B->ops->mult           = MatMult_SeqSELLCUDA;
320   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
321   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
322 
323   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
324   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
325   PetscFunctionReturn(PETSC_SUCCESS);
326 }
327 
328 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
329 {
330   PetscFunctionBegin;
331   PetscCall(MatCreate_SeqSELL(B));
332   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
333   PetscFunctionReturn(PETSC_SUCCESS);
334 }
335