xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision 4e58db63d9a16e1e6d7a6b59b385108b2e578961)
1 #include <cuda_runtime.h>
2 
3 #include <petscdevice_cuda.h>
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 
6 #define SLICE_HEIGHT 16
7 
8 typedef struct {
9   PetscInt  *colidx; /* column index */
10   MatScalar *val;
11   PetscInt  *sliidx;
12   PetscInt   nonzerostate;
13   PetscInt   kernelchoice;
14   PetscInt   blocky;
15 } Mat_SeqSELLCUDA;
16 
17 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
18 {
19   PetscFunctionBegin;
20   if (*cudastruct) {
21     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
22     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
23     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
24     PetscCall(PetscFree(*cudastruct));
25   }
26   PetscFunctionReturn(PETSC_SUCCESS);
27 }
28 
29 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
30 {
31   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
32   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
33 
34   PetscFunctionBegin;
35   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
36     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
37     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
38       /* copy values only */
39       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
40       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
41     } else {
42       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
43       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
44       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
45       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
46       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
47       /* copy values, nz or maxallocmat? */
48       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
49       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
50 
51       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
52       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
53       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
54       cudastruct->nonzerostate = A->nonzerostate;
55     }
56     PetscCallCUDA(WaitForCUDA());
57     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
58     A->offloadmask = PETSC_OFFLOAD_BOTH;
59   }
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
64 {
65   PetscInt  i, row, slice_id, row_in_slice;
66   MatScalar sum;
67   /* one thread per row. */
68   row = blockIdx.x * blockDim.x + threadIdx.x;
69   if (row < nrows) {
70     slice_id     = row / sliceheight;
71     row_in_slice = row % sliceheight;
72     sum          = 0.0;
73     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
74     y[row] = sum;
75   }
76 }
77 
78 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
79 {
80   PetscInt  i, row, slice_id, row_in_slice;
81   MatScalar sum;
82   /* one thread per row. */
83   row = blockIdx.x * blockDim.x + threadIdx.x;
84   if (row < nrows) {
85     slice_id     = row / sliceheight;
86     row_in_slice = row % sliceheight;
87     sum          = 0.0;
88     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
89     z[row] = y[row] + sum;
90   }
91 }
92 
93 /* use 1 block per slice, suitable for large slice width */
94 template <int BLOCKY>
95 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
96 {
97   __shared__ MatScalar shared[32][BLOCKY];
98   PetscInt             i, row, slice_id = blockIdx.x;
99   int                  tid = threadIdx.x + threadIdx.y * 32;
100   /* transposed index */
101   int         tidx = tid % BLOCKY;
102   int         tidy = tid / BLOCKY;
103   PetscScalar t    = 0.0;
104 
105   row = slice_id * sliceheight + threadIdx.x % sliceheight;
106   if (row < nrows) {
107     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
108   }
109 #pragma unroll
110   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
111   /* transpose layout to reduce each row using warp shfl */
112   shared[threadIdx.x][threadIdx.y] = t;
113   __syncthreads();
114   t = shared[tidy][tidx];
115 #pragma unroll
116   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
117   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
118   __syncthreads();
119   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
120 }
121 
122 /* use 1 warp per slice, suitable for small slice width */
123 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
124 {
125   PetscInt i, row, slice_id;
126   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
127   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
128   double t = 0.0;
129   if (row < nrows) {
130     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
131   }
132 #pragma unroll
133   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
134   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
135 }
136 
137 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
138 {
139   __shared__ MatScalar shared[512];
140   PetscInt             i, row, slice_id, row_in_slice;
141   /* multiple threads per row. */
142   row = blockIdx.x * blockDim.x + threadIdx.x;
143   if (row < nrows) {
144     slice_id     = row / SLICE_HEIGHT;
145     row_in_slice = row % SLICE_HEIGHT;
146 
147     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
148     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
149     __syncthreads();
150     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
151     __syncthreads();
152     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
153     __syncthreads();
154     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
155     __syncthreads();
156     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
157     __syncthreads();
158     if (threadIdx.y < 1) {
159       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
160       y[row] = shared[threadIdx.x];
161     }
162   }
163 }
164 
165 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
166 {
167   __shared__ MatScalar shared[512];
168   PetscInt             i, row, slice_id, row_in_slice;
169   /* multiple threads per row. */
170   row = blockIdx.x * blockDim.x + threadIdx.x;
171   if (row < nrows) {
172     slice_id     = row / SLICE_HEIGHT;
173     row_in_slice = row % SLICE_HEIGHT;
174 
175     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
176     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
177     __syncthreads();
178     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
179     __syncthreads();
180     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
181     __syncthreads();
182     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
183     __syncthreads();
184     if (threadIdx.y < 1) {
185       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
186       y[row] = shared[threadIdx.x];
187     }
188   }
189 }
190 
191 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
192 {
193   __shared__ MatScalar shared[512];
194   PetscInt             i, row, slice_id, row_in_slice;
195   /* multiple threads per row. */
196   row = blockIdx.x * blockDim.x + threadIdx.x;
197   if (row < nrows) {
198     slice_id     = row / SLICE_HEIGHT;
199     row_in_slice = row % SLICE_HEIGHT;
200 
201     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
202     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
203     __syncthreads();
204     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
205     __syncthreads();
206     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
207     __syncthreads();
208     if (threadIdx.y < 1) {
209       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
210       y[row] = shared[threadIdx.x];
211     }
212   }
213 }
214 
215 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
216 {
217   __shared__ MatScalar shared[512];
218   PetscInt             i, row, slice_id, row_in_slice;
219   /* multiple threads per row. */
220   row = blockIdx.x * blockDim.x + threadIdx.x;
221   if (row < nrows) {
222     slice_id     = row / SLICE_HEIGHT;
223     row_in_slice = row % SLICE_HEIGHT;
224 
225     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
226     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
227     __syncthreads();
228     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
229     __syncthreads();
230     if (threadIdx.y < 1) {
231       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
232       y[row] = shared[threadIdx.x];
233     }
234   }
235 }
236 
237 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
238 {
239   __shared__ MatScalar shared[512];
240   PetscInt             i, row, slice_id, row_in_slice;
241   /* multiple threads per row. */
242   row = blockIdx.x * blockDim.x + threadIdx.x;
243   if (row < nrows) {
244     slice_id     = row / SLICE_HEIGHT;
245     row_in_slice = row % SLICE_HEIGHT;
246 
247     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
248     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
249     __syncthreads();
250     if (threadIdx.y < 1) {
251       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
252       y[row] = shared[threadIdx.x];
253     }
254   }
255 }
256 
257 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
258 {
259   __shared__ MatScalar shared[512];
260   PetscInt             i, row, slice_id, row_in_slice;
261   /* multiple threads per row. */
262   row = blockIdx.x * blockDim.x + threadIdx.x;
263   if (row < nrows) {
264     slice_id     = row / SLICE_HEIGHT;
265     row_in_slice = row % SLICE_HEIGHT;
266 
267     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
268     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
269     __syncthreads();
270     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
271     __syncthreads();
272     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
273     __syncthreads();
274     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
275     __syncthreads();
276     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
277     __syncthreads();
278     if (threadIdx.y < 1) {
279       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
280       z[row] = y[row] + shared[threadIdx.x];
281     }
282   }
283 }
284 
285 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
286 {
287   __shared__ MatScalar shared[512];
288   PetscInt             i, row, slice_id, row_in_slice;
289   /* multiple threads per row. */
290   row = blockIdx.x * blockDim.x + threadIdx.x;
291   if (row < nrows) {
292     slice_id     = row / SLICE_HEIGHT;
293     row_in_slice = row % SLICE_HEIGHT;
294 
295     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
296     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
297     __syncthreads();
298     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
299     __syncthreads();
300     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
301     __syncthreads();
302     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
303     __syncthreads();
304     if (threadIdx.y < 1) {
305       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
306       z[row] = y[row] + shared[threadIdx.x];
307     }
308   }
309 }
310 
311 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
312 {
313   __shared__ MatScalar shared[512];
314   PetscInt             i, row, slice_id, row_in_slice;
315   /* multiple threads per row. */
316   row = blockIdx.x * blockDim.x + threadIdx.x;
317   if (row < nrows) {
318     slice_id     = row / SLICE_HEIGHT;
319     row_in_slice = row % SLICE_HEIGHT;
320 
321     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
322     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
323     __syncthreads();
324     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
325     __syncthreads();
326     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
327     __syncthreads();
328     if (threadIdx.y < 1) {
329       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
330       z[row] = y[row] + shared[threadIdx.x];
331     }
332   }
333 }
334 
335 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
336 {
337   __shared__ MatScalar shared[512];
338   PetscInt             i, row, slice_id, row_in_slice;
339   /* multiple threads per row. */
340   row = blockIdx.x * blockDim.x + threadIdx.x;
341   if (row < nrows) {
342     slice_id     = row / SLICE_HEIGHT;
343     row_in_slice = row % SLICE_HEIGHT;
344 
345     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
346     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
347     __syncthreads();
348     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
349     __syncthreads();
350     if (threadIdx.y < 1) {
351       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
352       z[row] = y[row] + shared[threadIdx.x];
353     }
354   }
355 }
356 
357 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
358 {
359   __shared__ MatScalar shared[512];
360   PetscInt             i, row, slice_id, row_in_slice;
361   /* multiple threads per row. */
362   row = blockIdx.x * blockDim.x + threadIdx.x;
363   if (row < nrows) {
364     slice_id     = row / SLICE_HEIGHT;
365     row_in_slice = row % SLICE_HEIGHT;
366 
367     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
368     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
369     __syncthreads();
370     if (threadIdx.y < 1) {
371       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
372       z[row] = y[row] + shared[threadIdx.x];
373     }
374   }
375 }
376 
377 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
378 {
379   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
380   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
381   PetscScalar       *y;
382   const PetscScalar *x;
383   PetscInt           totalslices = a->totalslices, nrows = A->rmap->n, sliceheight = a->sliceheight;
384   MatScalar         *aval;
385   PetscInt          *acolidx;
386   PetscInt          *sliidx;
387   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
388   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
389 
390   PetscFunctionBegin;
391   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
392   PetscCall(MatSeqSELLCUDACopyToGPU(A));
393   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
394   aval    = cudastruct->val;
395   acolidx = cudastruct->colidx;
396   sliidx  = cudastruct->sliidx;
397 
398   PetscCall(VecCUDAGetArrayRead(xx, &x));
399   PetscCall(VecCUDAGetArrayWrite(yy, &y));
400   PetscCall(PetscLogGpuTimeBegin());
401 
402   switch (cudastruct->kernelchoice) {
403   case 9:
404     nblocks = 1 + (nrows - 1) / sliceheight;
405     if (cudastruct->blocky == 2) {
406       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
407     } else if (cudastruct->blocky == 4) {
408       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
409     } else if (cudastruct->blocky == 8) {
410       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
411     } else if (cudastruct->blocky == 16) {
412       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
413     } else if (cudastruct->blocky == 32) {
414       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
415     }
416     break;
417   case 7:
418     nblocks = 1 + (nrows - 1) / (2 * sliceheight);
419     if (cudastruct->blocky == 2) {
420       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
421     } else if (cudastruct->blocky == 4) {
422       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
423     } else if (cudastruct->blocky == 8) {
424       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
425     } else if (cudastruct->blocky == 16) {
426       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
427     } else if (cudastruct->blocky == 32) {
428       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
429     } else {
430       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
431     }
432     break;
433   case 6:
434     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
435     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
436     break;
437   case 5:
438     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
439     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
440     break;
441   case 4:
442     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
443     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
444     break;
445   case 3:
446     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
447     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
448     break;
449   case 2: /* 16 slices per block if blocksize=512 */
450     nblocks = 1 + (nrows - 1) / (blocksize / 2);
451     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
452     break;
453   case 1: /* 32 slices per block if blocksize=512 */
454     nblocks = 1 + (nrows - 1) / blocksize;
455     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
456     break;
457   case 0:
458     if (sliceheight * a->maxslicewidth > 20800) { /* important threshold */
459       nblocks = 1 + (nrows - 1) / sliceheight;
460       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
461     } else {
462       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
463       if (avgslicesize <= 96) {
464         nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
465         matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
466       } else if (avgslicesize <= 432) {
467         nblocks = 1 + (nrows - 1) / sliceheight;
468         matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
469       } else if (avgslicesize <= 2400) {
470         nblocks = 1 + (nrows - 1) / sliceheight;
471         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
472       } else {
473         nblocks = 1 + (nrows - 1) / sliceheight;
474         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
475       }
476     }
477     break;
478   }
479   PetscCall(PetscLogGpuTimeEnd());
480   PetscCall(VecCUDARestoreArrayRead(xx, &x));
481   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
482   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
483   PetscFunctionReturn(PETSC_SUCCESS);
484 }
485 
486 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
487 {
488   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
489   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
490   PetscScalar       *z;
491   const PetscScalar *y, *x;
492   PetscInt           totalslices = a->totalslices, nrows = A->rmap->n, sliceheight = a->sliceheight;
493   MatScalar         *aval    = cudastruct->val;
494   PetscInt          *acolidx = cudastruct->colidx;
495   PetscInt          *sliidx  = cudastruct->sliidx;
496 
497   PetscFunctionBegin;
498   PetscCheck(sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
499   PetscCall(MatSeqSELLCUDACopyToGPU(A));
500   if (a->nz) {
501     PetscInt nblocks, blocksize = 512;
502     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
503     PetscCall(VecCUDAGetArrayRead(xx, &x));
504     PetscCall(VecCUDAGetArrayRead(yy, &y));
505     PetscCall(VecCUDAGetArrayWrite(zz, &z));
506     PetscCall(PetscLogGpuTimeBegin());
507 
508     switch (cudastruct->kernelchoice) {
509     case 6:
510       nblocks = 1 + (nrows - 1) / (blocksize / 32);
511       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
512       break;
513     case 5:
514       nblocks = 1 + (nrows - 1) / (blocksize / 16);
515       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
516       break;
517     case 4:
518       nblocks = 1 + (nrows - 1) / (blocksize / 8);
519       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
520       break;
521     case 3:
522       nblocks = 1 + (nrows - 1) / (blocksize / 4);
523       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
524       break;
525     case 2:
526       nblocks = 1 + (nrows - 1) / (blocksize / 2);
527       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
528       break;
529     case 1:
530       nblocks = 1 + (nrows - 1) / blocksize;
531       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
532       break;
533     case 0: /* TODO */
534       break;
535     }
536     PetscCall(PetscLogGpuTimeEnd());
537     PetscCall(VecCUDARestoreArrayRead(xx, &x));
538     PetscCall(VecCUDARestoreArrayRead(yy, &y));
539     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
540     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
541   } else {
542     PetscCall(VecCopy(yy, zz));
543   }
544   PetscFunctionReturn(PETSC_SUCCESS);
545 }
546 
547 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
548 {
549   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
550   PetscInt         kernel, blocky;
551   PetscBool        flg;
552 
553   PetscFunctionBegin;
554   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
555   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
556   if (flg) {
557     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
558     cudastruct->kernelchoice = kernel;
559   }
560   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
561   if (flg) {
562     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", kernel);
563     cudastruct->blocky = blocky;
564   }
565   PetscOptionsHeadEnd();
566   PetscFunctionReturn(PETSC_SUCCESS);
567 }
568 
569 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
570 {
571   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
572 
573   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
574   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
575   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
576   PetscFunctionReturn(PETSC_SUCCESS);
577 }
578 
579 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
580 {
581   PetscFunctionBegin;
582   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
583   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
584   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
585   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
586   A->ops->mult    = MatMult_SeqSELLCUDA;
587   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
588   PetscFunctionReturn(PETSC_SUCCESS);
589 }
590 
591 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
592 {
593   PetscFunctionBegin;
594   if (A->factortype == MAT_FACTOR_NONE) {
595     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
596   }
597   PetscCall(MatDestroy_SeqSELL(A));
598   PetscFunctionReturn(PETSC_SUCCESS);
599 }
600 
601 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
602 static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
603 {
604   PetscFunctionBegin;
605   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
606   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
607   PetscFunctionReturn(PETSC_SUCCESS);
608 }
609 
610 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
611 {
612   Mat_SeqSELLCUDA *cudastruct;
613 
614   PetscFunctionBegin;
615   PetscCall(PetscFree(B->defaultvectype));
616   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
617 
618   if (!B->spptr) {
619     if (B->factortype == MAT_FACTOR_NONE) {
620       PetscCall(PetscNew(&cudastruct));
621       B->spptr = cudastruct;
622     }
623   }
624 
625   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
626   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
627   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
628   B->ops->mult           = MatMult_SeqSELLCUDA;
629   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
630   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
631 
632   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
633   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
634 
635   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
636   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
637   PetscFunctionReturn(PETSC_SUCCESS);
638 }
639 
640 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
641 {
642   PetscFunctionBegin;
643   PetscCall(MatCreate_SeqSELL(B));
644   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
645   PetscFunctionReturn(PETSC_SUCCESS);
646 }
647