#include <cuda_runtime.h>

#include <petscdevice_cuda.h>
#include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/

#define SLICE_HEIGHT 16

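/* Device-side copy of the sliced ELLPACK (SELL) arrays. As an illustrative sketch with a toy slice
   height of 2 (the kernels below hardcode SLICE_HEIGHT 16), two rows [a b] and [c d e] form one
   slice, stored column-major and padded to the width of the longest row:

     val    = a c | b d | 0 e        colidx = (the matching column indices)

   Slice s occupies positions [sliidx[s], sliidx[s+1]) of val/colidx, and the k-th stored entry of
   row r sits at sliidx[s] + (r % sliceheight) + k * sliceheight with s = r / sliceheight, which is
   exactly how the kernels below walk the arrays. */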
typedef struct {
  PetscInt  *colidx; /* column index of each stored entry */
  MatScalar *val;    /* entry values, in the same slice-major order as on the host */
  PetscInt  *sliidx; /* offset of each slice; sliidx[totalslices] is the total number of stored entries */
  PetscInt   nonzerostate; /* nonzero state of the host matrix at the last full copy */
  PetscInt   kernelchoice; /* SpMV kernel to launch, settable with -mat_sell_spmv_cuda_kernel */
} Mat_SeqSELLCUDA;

static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
    if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
    if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

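/* Lazily mirror the host SELL arrays on the GPU. If only the numerical values changed (the nonzero
   pattern, tracked through nonzerostate, is intact), just val is re-sent; otherwise the device
   arrays are reallocated and colidx, val, and sliidx are all copied before the offload mask is set
   to PETSC_OFFLOAD_BOTH. */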
static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
      if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
      if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
      /* copy only the entries in use, sliidx[totalslices] of the maxallocmat allocated slots */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
      cudastruct->nonzerostate = A->nonzerostate;
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

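/* Kernel choice 1: one thread per row. Because each slice is stored column-major, the 16 threads
   working on a slice touch consecutive val/colidx entries at every step of the loop, so the global
   loads coalesce. The thread for row r scans i = sliidx[r / 16] + r % 16, then i += 16, ... up to
   sliidx[r / 16 + 1]. */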
__global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

__global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

/* use 1 block per slice, suitable for large slice width */
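/* Each block owns one slice and is shaped (SLICE_HEIGHT, BLOCKY): thread (x, y) accumulates every
   BLOCKY-th entry of slice row x into t. The partials are staged in shared memory and re-read with
   transposed indices so that the BLOCKY partials of one row become consecutive lanes of a warp
   segment, which __shfl_down_sync then reduces; lane 0 of each segment publishes the row total. */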
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[SLICE_HEIGHT][BLOCKY];
  PetscInt             i, row, slice_id;
  slice_id = blockIdx.x;
  row      = slice_id * SLICE_HEIGHT + threadIdx.x;
  int tid  = threadIdx.x + threadIdx.y * SLICE_HEIGHT;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  /* transpose layout to reduce each row using warp shfl */
  shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  __syncthreads();
  if (tidx == 0) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0) { y[row] = shared[0][threadIdx.x]; }
}

/* use 1 warp per slice, suitable for small slice width */
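/* Each 32-lane warp covers one 16-row slice two entries deep: lanes 0-15 and 16-31 accumulate
   alternating entries of the 16 rows, a single __shfl_down_sync over distance 16 folds the upper
   half onto the lower, and lanes 0-15 write the totals. blockDim.y such warps share a block. */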
__global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * SLICE_HEIGHT + threadIdx.x % SLICE_HEIGHT;
  MatScalar t = 0.0; /* MatScalar, not double, so the accumulator matches the precision PETSc was configured with */
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  t += __shfl_down_sync(0xffffffff, t, 16);
  if (row < nrows && threadIdx.x < 16) { y[row] = t; }
}

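/* Kernel choices 2-6 share one pattern: a 512-thread block of shape (blockDim.x, blockDim.y) gives
   each row blockDim.y partial sums in shared memory, then folds them with a log2(blockDim.y)-step
   tree reduction over threadIdx.y. Kernel N uses blockDim.y = 2^(N-1); e.g. kernel6 runs (16, 32)
   blocks and starts reducing at offset 16, while kernel2 runs (256, 2) blocks and needs one step. */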
__global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

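/* The matmultadd_* kernels below mirror their matmult_* counterparts exactly, except that the final
   store is z[row] = y[row] + sum instead of y[row] = sum. */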
__global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be a multiple of SLICE_HEIGHT*32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
  dim3               block_k8(32, SLICE_HEIGHT);

  PetscFunctionBegin;
  PetscCheck(a->sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* read the device pointers only after MatSeqSELLCUDACopyToGPU(), which may (re)allocate them */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

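  /* nblocks is the ceiling of nrows over the rows a block covers, via the 1 + (nrows - 1) / rows_per_block
     idiom. kernelchoice 0 (the default) picks a kernel from the slice statistics gathered at assembly
     time: very wide slices go to the block-per-slice kernel9, narrow ones to the warp-per-slice kernel7,
     and kernel9's BLOCKY template parameter is scaled with the average slice width in between. */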
  switch (cudastruct->kernelchoice) {
  case 9:
    if (a->maxslicewidth > 512) {
      nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else if (a->avgslicewidth < 8) {
      nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else if (a->avgslicewidth < 16) {
      nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(SLICE_HEIGHT, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else if (a->avgslicewidth < 32) {
      nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(SLICE_HEIGHT, 8)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else if (a->avgslicewidth < 64) {
      nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(SLICE_HEIGHT, 16)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else {
      nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    }
    break;
  case 8:
    nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
    matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 7:
    nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
    matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 0:
    if (a->maxslicewidth > 64) {
      nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else {
      if (a->avgslicewidth < 8) {
        nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
        matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
      } else if (a->avgslicewidth < 16) {
        nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
        matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(SLICE_HEIGHT, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
      } else if (a->avgslicewidth < 32) {
        nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(SLICE_HEIGHT, 8)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(SLICE_HEIGHT, 16)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
      }
    }
    break;
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;

  PetscFunctionBegin;
  PetscCheck(a->sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* read the device pointers only after MatSeqSELLCUDACopyToGPU(), which may (re)allocate them */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    default: /* TODO: kernel choices 0 and 7-9 have no tiled multadd variant yet; fall back to the basic kernel so a result is always produced */
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
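  /* e.g. -mat_sell_spmv_cuda_kernel 7 forces the warp-per-slice kernel; the default of 0 lets
     MatMult_SeqSELLCUDA choose a kernel from the slice-width statistics */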
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it must be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
  A->ops->mult    = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
  }
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); /* linkage must match the PETSC_EXTERN definition below */
static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

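/* Constructor for MATSEQSELLCUDA: create a plain SeqSELL matrix, then swap in the CUDA vtable and
   default vector type. Typical usage is MatSetType(A, MATSEQSELLCUDA) or, from the command line,
   -mat_type seqsellcuda. */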
PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}