xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision a9dd396c8c55b7f61751def2566f73b25b75cc75)
1 #include <cuda_runtime.h>
2 
3 #include <petscdevice_cuda.h>
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 
6 #define SLICE_HEIGHT 16
7 
8 typedef struct {
9   PetscInt  *colidx; /* column index */
10   MatScalar *val;
11   PetscInt  *sliidx;
12   PetscInt   nonzerostate;
13   PetscInt   kernelchoice;
14   PetscInt   blocky;
15 } Mat_SeqSELLCUDA;
16 
17 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
18 {
19   PetscFunctionBegin;
20   if (*cudastruct) {
21     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
22     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
23     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
24     PetscCall(PetscFree(*cudastruct));
25   }
26   PetscFunctionReturn(PETSC_SUCCESS);
27 }
28 
29 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
30 {
31   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
32   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
33 
34   PetscFunctionBegin;
35   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
36     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
37     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
38       /* copy values only */
39       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
40       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
41     } else {
42       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
43       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
44       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
45       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
46       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
47       /* copy values, nz or maxallocmat? */
48       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
49       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
50 
51       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
52       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
53       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
54       cudastruct->nonzerostate = A->nonzerostate;
55     }
56     PetscCallCUDA(WaitForCUDA());
57     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
58     A->offloadmask = PETSC_OFFLOAD_BOTH;
59   }
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
64 {
65   PetscInt  i, row, slice_id, row_in_slice;
66   MatScalar sum;
67   /* one thread per row. */
68   row = blockIdx.x * blockDim.x + threadIdx.x;
69   if (row < nrows) {
70     slice_id     = row / sliceheight;
71     row_in_slice = row % sliceheight;
72     sum          = 0.0;
73     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
74     y[row] = sum;
75   }
76 }
77 
78 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
79 {
80   PetscInt  i, row, slice_id, row_in_slice;
81   MatScalar sum;
82   /* one thread per row. */
83   row = blockIdx.x * blockDim.x + threadIdx.x;
84   if (row < nrows) {
85     slice_id     = row / sliceheight;
86     row_in_slice = row % sliceheight;
87     sum          = 0.0;
88     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
89     z[row] = y[row] + sum;
90   }
91 }
92 
93 /* use 1 block per slice, suitable for large slice width */
94 template <int BLOCKY>
95 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
96 {
97   __shared__ MatScalar shared[32][BLOCKY];
98   PetscInt             i, row, slice_id = blockIdx.x;
99   int                  tid = threadIdx.x + threadIdx.y * 32;
100   /* transposed index */
101   int         tidx = tid % BLOCKY;
102   int         tidy = tid / BLOCKY;
103   PetscScalar t    = 0.0;
104 
105   row = slice_id * sliceheight + threadIdx.x % sliceheight;
106   if (row < nrows) {
107     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
108   }
109 #pragma unroll
110   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
111   /* transpose layout to reduce each row using warp shfl */
112   shared[threadIdx.x][threadIdx.y] = t;
113   __syncthreads();
114   t = shared[tidy][tidx];
115 #pragma unroll
116   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
117   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
118   __syncthreads();
119   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
120 }
121 
122 /* use 1 warp per slice, suitable for small slice width */
123 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
124 {
125   PetscInt i, row, slice_id;
126   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
127   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
128   double t = 0.0;
129   if (row < nrows) {
130     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
131   }
132 #pragma unroll
133   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
134   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
135 }
136 
137 /***********  Kernel 2-6  are tied to slice height 16. They are kept only for performance comparison  **********/
138 
139 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
140 {
141   __shared__ MatScalar shared[512];
142   PetscInt             i, row, slice_id, row_in_slice;
143   /* multiple threads per row. */
144   row = blockIdx.x * blockDim.x + threadIdx.x;
145   if (row < nrows) {
146     slice_id     = row / SLICE_HEIGHT;
147     row_in_slice = row % SLICE_HEIGHT;
148 
149     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
150     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
151     __syncthreads();
152     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
153     __syncthreads();
154     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
155     __syncthreads();
156     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
157     __syncthreads();
158     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
159     __syncthreads();
160     if (threadIdx.y < 1) {
161       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
162       y[row] = shared[threadIdx.x];
163     }
164   }
165 }
166 
167 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
168 {
169   __shared__ MatScalar shared[512];
170   PetscInt             i, row, slice_id, row_in_slice;
171   /* multiple threads per row. */
172   row = blockIdx.x * blockDim.x + threadIdx.x;
173   if (row < nrows) {
174     slice_id     = row / SLICE_HEIGHT;
175     row_in_slice = row % SLICE_HEIGHT;
176 
177     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
178     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
179     __syncthreads();
180     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
181     __syncthreads();
182     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
183     __syncthreads();
184     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
185     __syncthreads();
186     if (threadIdx.y < 1) {
187       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
188       y[row] = shared[threadIdx.x];
189     }
190   }
191 }
192 
193 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
194 {
195   __shared__ MatScalar shared[512];
196   PetscInt             i, row, slice_id, row_in_slice;
197   /* multiple threads per row. */
198   row = blockIdx.x * blockDim.x + threadIdx.x;
199   if (row < nrows) {
200     slice_id     = row / SLICE_HEIGHT;
201     row_in_slice = row % SLICE_HEIGHT;
202 
203     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
204     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
205     __syncthreads();
206     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
207     __syncthreads();
208     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
209     __syncthreads();
210     if (threadIdx.y < 1) {
211       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
212       y[row] = shared[threadIdx.x];
213     }
214   }
215 }
216 
217 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
218 {
219   __shared__ MatScalar shared[512];
220   PetscInt             i, row, slice_id, row_in_slice;
221   /* multiple threads per row. */
222   row = blockIdx.x * blockDim.x + threadIdx.x;
223   if (row < nrows) {
224     slice_id     = row / SLICE_HEIGHT;
225     row_in_slice = row % SLICE_HEIGHT;
226 
227     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
228     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
229     __syncthreads();
230     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
231     __syncthreads();
232     if (threadIdx.y < 1) {
233       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
234       y[row] = shared[threadIdx.x];
235     }
236   }
237 }
238 
239 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
240 {
241   __shared__ MatScalar shared[512];
242   PetscInt             i, row, slice_id, row_in_slice;
243   /* multiple threads per row. */
244   row = blockIdx.x * blockDim.x + threadIdx.x;
245   if (row < nrows) {
246     slice_id     = row / SLICE_HEIGHT;
247     row_in_slice = row % SLICE_HEIGHT;
248 
249     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
250     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
251     __syncthreads();
252     if (threadIdx.y < 1) {
253       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
254       y[row] = shared[threadIdx.x];
255     }
256   }
257 }
258 
259 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
260 {
261   __shared__ MatScalar shared[512];
262   PetscInt             i, row, slice_id, row_in_slice;
263   /* multiple threads per row. */
264   row = blockIdx.x * blockDim.x + threadIdx.x;
265   if (row < nrows) {
266     slice_id     = row / SLICE_HEIGHT;
267     row_in_slice = row % SLICE_HEIGHT;
268 
269     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
270     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
271     __syncthreads();
272     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
273     __syncthreads();
274     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
275     __syncthreads();
276     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
277     __syncthreads();
278     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
279     __syncthreads();
280     if (threadIdx.y < 1) {
281       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
282       z[row] = y[row] + shared[threadIdx.x];
283     }
284   }
285 }
286 
287 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
288 {
289   __shared__ MatScalar shared[512];
290   PetscInt             i, row, slice_id, row_in_slice;
291   /* multiple threads per row. */
292   row = blockIdx.x * blockDim.x + threadIdx.x;
293   if (row < nrows) {
294     slice_id     = row / SLICE_HEIGHT;
295     row_in_slice = row % SLICE_HEIGHT;
296 
297     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
298     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
299     __syncthreads();
300     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
301     __syncthreads();
302     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
303     __syncthreads();
304     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
305     __syncthreads();
306     if (threadIdx.y < 1) {
307       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
308       z[row] = y[row] + shared[threadIdx.x];
309     }
310   }
311 }
312 
313 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
314 {
315   __shared__ MatScalar shared[512];
316   PetscInt             i, row, slice_id, row_in_slice;
317   /* multiple threads per row. */
318   row = blockIdx.x * blockDim.x + threadIdx.x;
319   if (row < nrows) {
320     slice_id     = row / SLICE_HEIGHT;
321     row_in_slice = row % SLICE_HEIGHT;
322 
323     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
324     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
325     __syncthreads();
326     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
327     __syncthreads();
328     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
329     __syncthreads();
330     if (threadIdx.y < 1) {
331       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
332       z[row] = y[row] + shared[threadIdx.x];
333     }
334   }
335 }
336 
337 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
338 {
339   __shared__ MatScalar shared[512];
340   PetscInt             i, row, slice_id, row_in_slice;
341   /* multiple threads per row. */
342   row = blockIdx.x * blockDim.x + threadIdx.x;
343   if (row < nrows) {
344     slice_id     = row / SLICE_HEIGHT;
345     row_in_slice = row % SLICE_HEIGHT;
346 
347     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
348     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
349     __syncthreads();
350     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
351     __syncthreads();
352     if (threadIdx.y < 1) {
353       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
354       z[row] = y[row] + shared[threadIdx.x];
355     }
356   }
357 }
358 
359 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
360 {
361   __shared__ MatScalar shared[512];
362   PetscInt             i, row, slice_id, row_in_slice;
363   /* multiple threads per row. */
364   row = blockIdx.x * blockDim.x + threadIdx.x;
365   if (row < nrows) {
366     slice_id     = row / SLICE_HEIGHT;
367     row_in_slice = row % SLICE_HEIGHT;
368 
369     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
370     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
371     __syncthreads();
372     if (threadIdx.y < 1) {
373       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
374       z[row] = y[row] + shared[threadIdx.x];
375     }
376   }
377 }
378 
379 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
380 {
381   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
382   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
383   PetscScalar       *y;
384   const PetscScalar *x;
385   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
386   MatScalar         *aval;
387   PetscInt          *acolidx;
388   PetscInt          *sliidx;
389   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
390   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
391 
392   PetscFunctionBegin;
393   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
394   PetscCall(MatSeqSELLCUDACopyToGPU(A));
395   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
396   aval    = cudastruct->val;
397   acolidx = cudastruct->colidx;
398   sliidx  = cudastruct->sliidx;
399 
400   PetscCall(VecCUDAGetArrayRead(xx, &x));
401   PetscCall(VecCUDAGetArrayWrite(yy, &y));
402   PetscCall(PetscLogGpuTimeBegin());
403 
404   switch (cudastruct->kernelchoice) {
405   case 9:
406     nblocks = 1 + (nrows - 1) / sliceheight;
407     if (cudastruct->blocky == 2) {
408       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
409     } else if (cudastruct->blocky == 4) {
410       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
411     } else if (cudastruct->blocky == 8) {
412       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
413     } else if (cudastruct->blocky == 16) {
414       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
415     } else if (cudastruct->blocky == 32) {
416       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
417     }
418     break;
419   case 7:
420     nblocks = 1 + (nrows - 1) / (2 * sliceheight);
421     if (cudastruct->blocky == 2) {
422       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
423     } else if (cudastruct->blocky == 4) {
424       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
425     } else if (cudastruct->blocky == 8) {
426       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
427     } else if (cudastruct->blocky == 16) {
428       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
429     } else if (cudastruct->blocky == 32) {
430       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
431     } else {
432       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
433     }
434     break;
435   case 6:
436     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
437     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
438     break;
439   case 5:
440     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
441     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
442     break;
443   case 4:
444     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
445     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
446     break;
447   case 3:
448     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
449     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
450     break;
451   case 2: /* 16 slices per block if blocksize=512 */
452     nblocks = 1 + (nrows - 1) / (blocksize / 2);
453     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
454     break;
455   case 1: /* 32 slices per block if blocksize=512 */
456     nblocks = 1 + (nrows - 1) / blocksize;
457     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
458     break;
459   case 0:
460     if (sliceheight * a->maxslicewidth > 20800) { /* important threshold */
461       nblocks = 1 + (nrows - 1) / sliceheight;
462       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
463     } else {
464       nblocks = 1 + (nrows - 1) / sliceheight;
465       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
466     }
467   }
468   PetscCall(PetscLogGpuTimeEnd());
469   PetscCall(VecCUDARestoreArrayRead(xx, &x));
470   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
471   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
472   PetscFunctionReturn(PETSC_SUCCESS);
473 }
474 
475 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
476 {
477   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
478   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
479   PetscScalar       *z;
480   const PetscScalar *y, *x;
481   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
482   MatScalar         *aval    = cudastruct->val;
483   PetscInt          *acolidx = cudastruct->colidx;
484   PetscInt          *sliidx  = cudastruct->sliidx;
485 
486   PetscFunctionBegin;
487   PetscCheck(sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
488   PetscCall(MatSeqSELLCUDACopyToGPU(A));
489   if (a->nz) {
490     PetscInt nblocks, blocksize = 512;
491     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
492     PetscCall(VecCUDAGetArrayRead(xx, &x));
493     PetscCall(VecCUDAGetArrayRead(yy, &y));
494     PetscCall(VecCUDAGetArrayWrite(zz, &z));
495     PetscCall(PetscLogGpuTimeBegin());
496 
497     switch (cudastruct->kernelchoice) {
498     case 6:
499       nblocks = 1 + (nrows - 1) / (blocksize / 32);
500       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
501       break;
502     case 5:
503       nblocks = 1 + (nrows - 1) / (blocksize / 16);
504       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
505       break;
506     case 4:
507       nblocks = 1 + (nrows - 1) / (blocksize / 8);
508       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
509       break;
510     case 3:
511       nblocks = 1 + (nrows - 1) / (blocksize / 4);
512       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
513       break;
514     case 2:
515       nblocks = 1 + (nrows - 1) / (blocksize / 2);
516       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
517       break;
518     case 1:
519       nblocks = 1 + (nrows - 1) / blocksize;
520       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
521       break;
522     case 0: /* TODO */
523       break;
524     }
525     PetscCall(PetscLogGpuTimeEnd());
526     PetscCall(VecCUDARestoreArrayRead(xx, &x));
527     PetscCall(VecCUDARestoreArrayRead(yy, &y));
528     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
529     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
530   } else {
531     PetscCall(VecCopy(yy, zz));
532   }
533   PetscFunctionReturn(PETSC_SUCCESS);
534 }
535 
536 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
537 {
538   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
539   PetscInt         kernel, blocky;
540   PetscBool        flg;
541 
542   PetscFunctionBegin;
543   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
544   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
545   if (flg) {
546     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
547     cudastruct->kernelchoice = kernel;
548   }
549   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
550   if (flg) {
551     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", kernel);
552     cudastruct->blocky = blocky;
553   }
554   PetscOptionsHeadEnd();
555   PetscFunctionReturn(PETSC_SUCCESS);
556 }
557 
558 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
559 {
560   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
561 
562   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
563   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
564   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
565   PetscFunctionReturn(PETSC_SUCCESS);
566 }
567 
568 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
569 {
570   PetscFunctionBegin;
571   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
572   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
573   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
574   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
575   A->ops->mult    = MatMult_SeqSELLCUDA;
576   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
577   PetscFunctionReturn(PETSC_SUCCESS);
578 }
579 
580 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
581 {
582   PetscFunctionBegin;
583   if (A->factortype == MAT_FACTOR_NONE) {
584     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
585   }
586   PetscCall(MatDestroy_SeqSELL(A));
587   PetscFunctionReturn(PETSC_SUCCESS);
588 }
589 
590 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
591 static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
592 {
593   PetscFunctionBegin;
594   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
595   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
596   PetscFunctionReturn(PETSC_SUCCESS);
597 }
598 
599 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
600 {
601   Mat_SeqSELLCUDA *cudastruct;
602 
603   PetscFunctionBegin;
604   PetscCall(PetscFree(B->defaultvectype));
605   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
606 
607   if (!B->spptr) {
608     if (B->factortype == MAT_FACTOR_NONE) {
609       PetscCall(PetscNew(&cudastruct));
610       B->spptr = cudastruct;
611     }
612   }
613 
614   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
615   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
616   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
617   B->ops->mult           = MatMult_SeqSELLCUDA;
618   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
619   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
620 
621   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
622   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
623 
624   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
625   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
626   PetscFunctionReturn(PETSC_SUCCESS);
627 }
628 
629 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
630 {
631   PetscFunctionBegin;
632   PetscCall(MatCreate_SeqSELL(B));
633   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
634   PetscFunctionReturn(PETSC_SUCCESS);
635 }
636