/* src/mat/impls/sell/seq/seqcuda/sellcuda.cu */
#include <cuda_runtime.h>

#include <petscdevice_cuda.h>
#include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/

#define SLICE_HEIGHT 16

typedef struct {
  PetscInt   maxallocmat;
  PetscInt   totalentries;
  PetscInt  *colidx; /* column index array, device pointer */
  MatScalar *val;    /* value array, device pointer */
  PetscInt   totalslices;
  PetscInt  *sliidx; /* slice index array, device pointer */
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
  PetscInt   blocky;
  PetscInt   chunksperblock;
  PetscInt   totalchunks;
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLCUDA;
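
/*
  Orientation for the SELL layout assumed below (an illustrative sketch, not code from this file):
  rows are grouped into slices of `sliceheight` rows, each slice is padded to the width of its
  longest row, and entries within a slice are stored column by column. Hence sliidx[s] is the
  offset of slice s, and the j-th entry of row r in slice s sits at sliidx[s] + j * sliceheight + r.
  For sliceheight = 2 and a slice with rows {a00, a01} and {a10}, the stored order is
  val = [a00, a10, a01, pad]; a row is traversed with stride `sliceheight`, which is exactly the
  access pattern of the kernels in this file.
*/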

static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
    if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
    if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
    if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
      if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
      if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
      if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
      cudastruct->maxallocmat  = a->maxallocmat;
      cudastruct->totalentries = a->sliidx[a->totalslices];
      cudastruct->totalslices  = a->totalslices;
      cudastruct->totalchunks  = a->totalchunks;
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(*(cudastruct->colidx))));
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(*(cudastruct->val))));
      /* allocate maxallocmat entries, but copy only the sliidx[totalslices] entries actually in use */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*(a->colidx)), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*(a->val)), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(*(cudastruct->sliidx))));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*(a->sliidx)), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->chunk_slice_map), a->totalchunks * sizeof(*(cudastruct->chunk_slice_map))));
      PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*(a->chunk_slice_map)), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
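
/*
  The copy above is lazy: it runs only when the host data is newer (offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU). When the nonzero pattern is unchanged
  (matching nonzerostate) only the values are transferred; otherwise the device arrays are
  reallocated and the column indices, slice offsets, and chunk-slice map are copied as well.
  On success the mask becomes PETSC_OFFLOAD_BOTH.
*/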

__global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

__global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

#if !defined(PETSC_USE_COMPLEX)
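/* The kernels below rely on warp shuffles (__shfl_down_sync) of the scalar accumulator, which is why they are compiled only for real scalars. */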
/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
}
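
/*
  Worked example of the reduction above, assuming sliceheight = 8 and BLOCKY = 4: after the
  strided loop, lane l of each warp holds a partial sum for row l % 8 of the slice. The first
  shuffle stage (offsets 16 and 8) folds these so lanes 0-7 hold one partial per row per warp.
  Staging through shared memory in transposed order gathers the BLOCKY per-row partials into
  consecutive lanes, and the second shuffle stage of width BLOCKY reduces them to the final
  row sums, which the first warp writes out.
*/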

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
}

template <int BLOCKY>
__device__ __forceinline__ bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
  #pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid                         = threadIdx.y * 2 + threadIdx.x / 16;
    shared[threadIdx.x + threadIdx.y * 32] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * 32] = *val;
      if (i == 1) head = false;
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
    __syncthreads();
  }
  return head;
}
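
/*
  segment_scan performs a backward segmented sum over the 2*BLOCKY half-warps of a block:
  flag[] holds the slice id each half-warp is working on, and a half-warp only accumulates
  contributions from later half-warps that share its slice id. The return value is true on
  the first half-warp of each segment (its "head"), which therefore ends up holding the
  complete per-lane totals for its slice; only heads write back, keeping the atomicAdd
  count in the callers low.
*/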

/* load-balancing version. The chunk size equals the number of threads per block (32 * BLOCKY) */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice of this chunk */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomicAdd(&y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
  /* reduction and write to output vector */
  #pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
  #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
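
/*
  Rationale for the chunked scheme above: slices can have very different widths, so a block per
  slice leaves blocks idle on short slices. Kernel8 instead assigns each block a fixed number of
  fixed-size chunks of 32 * BLOCKY entries, with chunk_slice_map[] recording the slice each chunk
  starts in. Chunks that stay inside one slice accumulate and reduce with warp shuffles as in
  kernel9; chunks that straddle slice boundaries fall back to the segmented scan above. Both
  paths accumulate into y with atomicAdd, which is why y is zeroed first.
*/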

/* load-balancing version. The chunk size equals the number of threads per block (32 * BLOCKY) */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice of this chunk */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomicAdd(&z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
  /* reduction and write to output vector */
  #pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
  #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* use 1 warp per slice, suitable for small slice width */
__global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  PetscScalar t = 0.0; /* accumulate in PetscScalar (real here) to match the other kernels and the build's precision */
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
}

/* use 1 warp per slice, suitable for small slice width */
__global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  PetscScalar t = 0.0; /* see matmult_seqsell_tiled_kernel7 */
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
}
#endif

/***********  Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison  **********/

__global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

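/*
  Summary of -mat_sell_spmv_cuda_kernel choices as dispatched below:
    0   heuristic default: kernel8 for highly irregular slice widths, otherwise kernel7 or
        kernel9 with a block shape picked from the average slice size
    1   basic kernel, one thread per row
    2-6 legacy tiled kernels, hardwired to slice height 16
    7   one warp per slice (small slice width)
    8   chunk-based load balancing (in MatMult below it is reached only through choice 0 at
        this revision; MatMultAdd also accepts it directly)
    9   one block per slice (large slice width)
  Choices 0 and 7-9 are compiled only for real scalars.
*/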
PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  PetscInt           chunksperblock, nchunks, *chunk_slice_map;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
  PetscReal          maxoveravg;

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices 2-6 require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* read the device pointers only after the copy: MatSeqSELLCUDACopyToGPU() may (re)allocate them */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
  case 9:
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7:
    nblocks = 1 + (nrows - 1) / (2 * sliceheight);
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
#endif
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
#if !defined(PETSC_USE_COMPLEX)
  case 0:
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* slice widths are highly irregular: use the load-balanced kernel8 */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = cudastruct->totalchunks;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 32) {
        matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  PetscInt           chunksperblock, nchunks, *chunk_slice_map;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscReal          maxoveravg;

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices 2-6 require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* read the device pointers only after the copy: MatSeqSELLCUDACopyToGPU() may (re)allocate them */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt blocky = cudastruct->blocky, nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
    case 9:
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks         = cudastruct->totalchunks;
      blocky          = a->chunksize / 32;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7:
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
#endif
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
#if !defined(PETSC_USE_COMPLEX)
    case 0:
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* slice widths are highly irregular: use the load-balanced kernel8 */
        /* each block handles approximately one slice */
        nchunks         = cudastruct->totalchunks;
        blocky          = a->chunksize / 32;
        chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = cudastruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 32) {
          matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
#endif
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel, blocky;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
    cudastruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice %" PetscInt_FMT ", it should be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
    if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
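
/*
  Example (hypothetical command line, for a sequential run): force the warp-per-slice kernel
  with 4 warps per block:

    ./app -mat_type seqsellcuda -mat_sell_spmv_cuda_kernel 7 -mat_sell_spmv_cuda_blocky 4

  Kernel 8 additionally honors -mat_sell_spmv_cuda_chunksperblock.
*/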

PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
  A->ops->mult    = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
{
  PetscBool    both = PETSC_FALSE;
  Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
    if (cudastruct->val) {
      both = PETSC_TRUE;
      PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*(cudastruct->val))));
    }
  }
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
  B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscCall(MatSetFromOptions(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
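
/*
  Minimal usage sketch (assumes a PETSc build configured with CUDA; the matrix A and the
  sizes n and nzmax are placeholders):

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, n, n, n, n));
    PetscCall(MatSetType(A, MATSEQSELLCUDA));         // ends up in MatCreate_SeqSELLCUDA() above
    PetscCall(MatSeqSELLSetPreallocation(A, nzmax, NULL));
    // ... MatSetValues(...) ...
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); // triggers MatSeqSELLCUDACopyToGPU()
    PetscCall(MatMult(A, x, y));                      // dispatches to MatMult_SeqSELLCUDA()
*/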
989