#include <cuda_runtime.h>

#include <petscdevice_cuda.h>
#include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/

#define SLICE_HEIGHT 16

typedef struct {
  PetscInt   maxallocmat;
  PetscInt   totalentries;
  PetscInt  *colidx; /* column index array, device pointer */
  MatScalar *val;    /* value array, device pointer */
  PetscInt   totalslices;
  PetscInt  *sliidx; /* slice index array, device pointer */
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
  PetscInt   blocky;
  PetscInt   chunksperblock;
  PetscInt   totalchunks;
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLCUDA;
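
/*
  For orientation, a hypothetical example (not data from this file) of how the
  SELL (sliced ELLPACK) arrays encode a 4x4 matrix with slice height 2:

      A = [1 2 0 0]
          [0 3 0 0]
          [0 0 4 5]
          [0 0 0 6]

  Each slice groups sliceheight consecutive rows, padded to the width of its
  longest row and stored column-major within the slice:

      sliidx = {0, 4, 8}                  start of each slice in val/colidx
      val    = {1, 3, 2, 0, 4, 6, 5, 0}   interleaved row entries, zero padded
      colidx = {0, 1, 1, 1, 2, 3, 3, 3}   padding repeats a valid column index

  A thread that owns row r starts at sliidx[r / sliceheight] + r % sliceheight
  and strides by sliceheight. This is the access pattern of the basic kernels
  below; the tiled kernels stride differently but read the same arrays.
*/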

static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
    if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
    if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
    if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
      if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
      if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
      if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
      cudastruct->maxallocmat  = a->maxallocmat;
      cudastruct->totalentries = a->sliidx[a->totalslices];
      cudastruct->totalslices  = a->totalslices;
      cudastruct->totalchunks  = a->totalchunks;
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
      /* copy values, nz or maxallocmat? */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->chunk_slice_map), a->totalchunks * sizeof(PetscInt)));
      PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

__global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}
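
/*
  Launch sketch for the basic kernels above and below (it mirrors case 1 of
  the dispatch in MatMult_SeqSELLCUDA() further down; repeated here only to
  make the one-thread-per-row mapping concrete):

    PetscInt blocksize = 512;                         // threads per block
    PetscInt nblocks   = 1 + (nrows - 1) / blocksize; // ceil(nrows / blocksize)
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
*/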

__global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
}
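
/*
  Worked example for the two-stage reduction above (illustrative numbers,
  assuming sliceheight = 8 and BLOCKY = 4, i.e. a 32x4 block of 4 warps):
  1. The strided load loop leaves one partial sum per thread; the first
     __shfl_down_sync pass (offsets 16, 8) folds each warp's 32 lanes onto
     lanes 0-7, so every warp holds 8 per-row partials, one per row of the slice.
  2. Lanes 0-7 of each warp store their partial at shared[row][warp]; the
     transposed read via (tidx, tidy) then places the 4 per-warp partials of
     each row in 4 consecutive threads of the same warp.
  3. The width-BLOCKY shuffle pass (offsets 2, 1) sums those 4 partials, and
     the threads with threadIdx.y == 0 write the 8 row results to y.
*/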

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
}

template <int BLOCKY>
__device__ __forceinline__ bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
#pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid                         = threadIdx.y * 2 + threadIdx.x / 16;
    shared[threadIdx.x + threadIdx.y * 32] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * 32] = *val;
      if (i == 1) head = false;
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
    __syncthreads();
  }
  return head;
}
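
/*
  Worked example for segment_scan (illustrative): with BLOCKY = 2 a block has
  2 warps, i.e. 4 half-warps of 16 threads, and flag[] holds the slice id of
  each half-warp. Suppose flag = {3, 3, 4, 4}. In round i, per-thread values
  are forwarded between half-warps that are i apart and share a slice id, so
  after rounds i = 1, 2 half-warp 0 holds the sums for slice 3 and half-warp 2
  the sums for slice 4. Exactly those two half-warps (the first of each
  segment) return head = true, so the callers (kernel8 below) issue one
  atomicAdd per slice rather than one per half-warp.
*/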

/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomicAdd(&y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
/* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomicAdd(&z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
/* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* use 1 warp per slice, suitable for small slice width */
__global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  MatScalar t = 0.0; /* accumulate in the matrix scalar type, matching the other kernels */
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
}

/* use 1 warp per slice, suitable for small slice width */
__global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  MatScalar t = 0.0; /* accumulate in the matrix scalar type, matching the other kernels */
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
}

/***********  Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison  **********/

__global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  PetscInt           chunksperblock, nchunks, *chunk_slice_map;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
  PetscReal          maxoveravg;

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices 2-6 require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays in cudastruct are not valid until MatSeqSELLCUDACopyToGPU() has been called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (cudastruct->kernelchoice) {
  case 9:
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7:
    nblocks = 1 + (nrows - 1) / (2 * sliceheight);
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 0:
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = cudastruct->totalchunks;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 32) {
        matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  PetscInt           chunksperblock, nchunks, *chunk_slice_map;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscReal          maxoveravg;

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices 2-6 require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* fetch the device arrays only after the copy: MatSeqSELLCUDACopyToGPU() may (re)allocate them */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt blocky = cudastruct->blocky, nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
    case 9:
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks         = cudastruct->totalchunks;
      blocky          = a->chunksize / 32;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7:
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 0:
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* each block handles approximately one slice */
        nchunks         = cudastruct->totalchunks;
        blocky          = a->chunksize / 32;
        chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = cudastruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 32) {
          matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel, blocky;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
    cudastruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
    if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
  A->ops->mult    = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
  }
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
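
/*
  Minimal usage sketch (illustrative, not part of the library): assemble a
  small tridiagonal matrix as MATSEQSELLCUDA and apply it once. The SpMV
  kernel and block shape can then be selected at run time with the options
  registered above, e.g.
      ./app -mat_sell_spmv_cuda_kernel 9 -mat_sell_spmv_cuda_blocky 8

  static PetscErrorCode ApplySELLCUDA(PetscInt n)
  {
    Mat A;
    Vec x, y;

    PetscFunctionBegin;
    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, n, n, n, n));
    PetscCall(MatSetType(A, MATSEQSELLCUDA));
    PetscCall(MatSetFromOptions(A)); / * picks up -mat_sell_spmv_cuda_* options * /
    PetscCall(MatSetUp(A));
    for (PetscInt i = 0; i < n; i++) {
      if (i > 0) PetscCall(MatSetValue(A, i, i - 1, -1.0, INSERT_VALUES));
      PetscCall(MatSetValue(A, i, i, 2.0, INSERT_VALUES));
      if (i < n - 1) PetscCall(MatSetValue(A, i, i + 1, -1.0, INSERT_VALUES));
    }
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); / * triggers MatSeqSELLCUDACopyToGPU() * /
    PetscCall(MatCreateVecs(A, &x, &y));
    PetscCall(VecSet(x, 1.0));
    PetscCall(MatMult(A, x, y)); / * dispatches to MatMult_SeqSELLCUDA() * /
    PetscCall(VecDestroy(&x));
    PetscCall(VecDestroy(&y));
    PetscCall(MatDestroy(&A));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
*/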