xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision cca9ff8b0ed15f9f7ff3d370e0ced223f69d26ef)
1 #include <cuda_runtime.h>
2 
3 #include <petscdevice_cuda.h>
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 
6 #define SLICE_HEIGHT 16
7 
8 typedef struct {
9   PetscInt  *colidx; /* column index */
10   MatScalar *val;
11   PetscInt  *sliidx;
12   PetscInt   nonzerostate;
13   PetscInt   kernelchoice;
14   PetscInt   blocky;
15 } Mat_SeqSELLCUDA;
16 
17 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
18 {
19   PetscFunctionBegin;
20   if (*cudastruct) {
21     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
22     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
23     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
24     PetscCall(PetscFree(*cudastruct));
25   }
26   PetscFunctionReturn(PETSC_SUCCESS);
27 }
28 
29 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
30 {
31   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
32   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
33 
34   PetscFunctionBegin;
35   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
36     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
37     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
38       /* copy values only */
39       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
40       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
41     } else {
42       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
43       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
44       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
45       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
46       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
47       /* copy values, nz or maxallocmat? */
48       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
49       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
50 
51       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
52       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
53       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
54       cudastruct->nonzerostate = A->nonzerostate;
55     }
56     PetscCallCUDA(WaitForCUDA());
57     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
58     A->offloadmask = PETSC_OFFLOAD_BOTH;
59   }
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
64 {
65   PetscInt  i, row, slice_id, row_in_slice;
66   MatScalar sum;
67   /* one thread per row. */
68   row = blockIdx.x * blockDim.x + threadIdx.x;
69   if (row < nrows) {
70     slice_id     = row / sliceheight;
71     row_in_slice = row % sliceheight;
72     sum          = 0.0;
73     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
74     y[row] = sum;
75   }
76 }
77 
78 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
79 {
80   PetscInt  i, row, slice_id, row_in_slice;
81   MatScalar sum;
82   /* one thread per row. */
83   row = blockIdx.x * blockDim.x + threadIdx.x;
84   if (row < nrows) {
85     slice_id     = row / sliceheight;
86     row_in_slice = row % sliceheight;
87     sum          = 0.0;
88     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
89     z[row] = y[row] + sum;
90   }
91 }
92 
93 /* use 1 block per slice, suitable for large slice width */
94 template <int BLOCKY>
95 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
96 {
97   __shared__ MatScalar shared[32][BLOCKY];
98   PetscInt             i, row, slice_id = blockIdx.x;
99   int                  tid = threadIdx.x + threadIdx.y * 32;
100   /* transposed index */
101   int         tidx = tid % BLOCKY;
102   int         tidy = tid / BLOCKY;
103   PetscScalar t    = 0.0;
104 
105   row = slice_id * sliceheight + threadIdx.x % sliceheight;
106   if (row < nrows) {
107     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
108   }
109 #pragma unroll
110   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
111   /* transpose layout to reduce each row using warp shfl */
112   shared[threadIdx.x][threadIdx.y] = t;
113   __syncthreads();
114   t = shared[tidy][tidx];
115 #pragma unroll
116   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
117   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
118   __syncthreads();
119   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
120 }
121 
122 /* use 1 block per slice, suitable for large slice width */
123 template <int BLOCKY>
124 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
125 {
126   __shared__ MatScalar shared[32][BLOCKY];
127   PetscInt             i, row, slice_id = blockIdx.x;
128   int                  tid = threadIdx.x + threadIdx.y * 32;
129   /* transposed index */
130   int         tidx = tid % BLOCKY;
131   int         tidy = tid / BLOCKY;
132   PetscScalar t    = 0.0;
133 
134   row = slice_id * sliceheight + threadIdx.x % sliceheight;
135   if (row < nrows) {
136     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
137   }
138 #pragma unroll
139   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
140   /* transpose layout to reduce each row using warp shfl */
141   shared[threadIdx.x][threadIdx.y] = t;
142   __syncthreads();
143   t = shared[tidy][tidx];
144 #pragma unroll
145   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
146   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
147   __syncthreads();
148   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
149 }
150 
151 /* use 1 warp per slice, suitable for small slice width */
152 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
153 {
154   PetscInt i, row, slice_id;
155   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
156   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
157   double t = 0.0;
158   if (row < nrows) {
159     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
160   }
161 #pragma unroll
162   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
163   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
164 }
165 
166 /* use 1 warp per slice, suitable for small slice width */
167 __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
168 {
169   PetscInt i, row, slice_id;
170   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
171   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
172   double t = 0.0;
173   if (row < nrows) {
174     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
175   }
176 #pragma unroll
177   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
178   if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
179 }
180 
181 /***********  Kernel 2-6  are tied to slice height 16. They are kept only for performance comparison  **********/
182 
183 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
184 {
185   __shared__ MatScalar shared[512];
186   PetscInt             i, row, slice_id, row_in_slice;
187   /* multiple threads per row. */
188   row = blockIdx.x * blockDim.x + threadIdx.x;
189   if (row < nrows) {
190     slice_id     = row / SLICE_HEIGHT;
191     row_in_slice = row % SLICE_HEIGHT;
192 
193     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
194     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
195     __syncthreads();
196     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
197     __syncthreads();
198     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
199     __syncthreads();
200     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
201     __syncthreads();
202     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
203     __syncthreads();
204     if (threadIdx.y < 1) {
205       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
206       y[row] = shared[threadIdx.x];
207     }
208   }
209 }
210 
211 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
212 {
213   __shared__ MatScalar shared[512];
214   PetscInt             i, row, slice_id, row_in_slice;
215   /* multiple threads per row. */
216   row = blockIdx.x * blockDim.x + threadIdx.x;
217   if (row < nrows) {
218     slice_id     = row / SLICE_HEIGHT;
219     row_in_slice = row % SLICE_HEIGHT;
220 
221     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
222     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
223     __syncthreads();
224     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
225     __syncthreads();
226     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
227     __syncthreads();
228     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
229     __syncthreads();
230     if (threadIdx.y < 1) {
231       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
232       y[row] = shared[threadIdx.x];
233     }
234   }
235 }
236 
237 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
238 {
239   __shared__ MatScalar shared[512];
240   PetscInt             i, row, slice_id, row_in_slice;
241   /* multiple threads per row. */
242   row = blockIdx.x * blockDim.x + threadIdx.x;
243   if (row < nrows) {
244     slice_id     = row / SLICE_HEIGHT;
245     row_in_slice = row % SLICE_HEIGHT;
246 
247     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
248     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
249     __syncthreads();
250     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
251     __syncthreads();
252     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
253     __syncthreads();
254     if (threadIdx.y < 1) {
255       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
256       y[row] = shared[threadIdx.x];
257     }
258   }
259 }
260 
261 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
262 {
263   __shared__ MatScalar shared[512];
264   PetscInt             i, row, slice_id, row_in_slice;
265   /* multiple threads per row. */
266   row = blockIdx.x * blockDim.x + threadIdx.x;
267   if (row < nrows) {
268     slice_id     = row / SLICE_HEIGHT;
269     row_in_slice = row % SLICE_HEIGHT;
270 
271     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
272     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
273     __syncthreads();
274     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
275     __syncthreads();
276     if (threadIdx.y < 1) {
277       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
278       y[row] = shared[threadIdx.x];
279     }
280   }
281 }
282 
283 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
284 {
285   __shared__ MatScalar shared[512];
286   PetscInt             i, row, slice_id, row_in_slice;
287   /* multiple threads per row. */
288   row = blockIdx.x * blockDim.x + threadIdx.x;
289   if (row < nrows) {
290     slice_id     = row / SLICE_HEIGHT;
291     row_in_slice = row % SLICE_HEIGHT;
292 
293     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
294     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
295     __syncthreads();
296     if (threadIdx.y < 1) {
297       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
298       y[row] = shared[threadIdx.x];
299     }
300   }
301 }
302 
303 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
304 {
305   __shared__ MatScalar shared[512];
306   PetscInt             i, row, slice_id, row_in_slice;
307   /* multiple threads per row. */
308   row = blockIdx.x * blockDim.x + threadIdx.x;
309   if (row < nrows) {
310     slice_id     = row / SLICE_HEIGHT;
311     row_in_slice = row % SLICE_HEIGHT;
312 
313     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
314     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
315     __syncthreads();
316     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
317     __syncthreads();
318     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
319     __syncthreads();
320     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
321     __syncthreads();
322     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
323     __syncthreads();
324     if (threadIdx.y < 1) {
325       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
326       z[row] = y[row] + shared[threadIdx.x];
327     }
328   }
329 }
330 
331 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
332 {
333   __shared__ MatScalar shared[512];
334   PetscInt             i, row, slice_id, row_in_slice;
335   /* multiple threads per row. */
336   row = blockIdx.x * blockDim.x + threadIdx.x;
337   if (row < nrows) {
338     slice_id     = row / SLICE_HEIGHT;
339     row_in_slice = row % SLICE_HEIGHT;
340 
341     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
342     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
343     __syncthreads();
344     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
345     __syncthreads();
346     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
347     __syncthreads();
348     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
349     __syncthreads();
350     if (threadIdx.y < 1) {
351       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
352       z[row] = y[row] + shared[threadIdx.x];
353     }
354   }
355 }
356 
357 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
358 {
359   __shared__ MatScalar shared[512];
360   PetscInt             i, row, slice_id, row_in_slice;
361   /* multiple threads per row. */
362   row = blockIdx.x * blockDim.x + threadIdx.x;
363   if (row < nrows) {
364     slice_id     = row / SLICE_HEIGHT;
365     row_in_slice = row % SLICE_HEIGHT;
366 
367     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
368     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
369     __syncthreads();
370     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
371     __syncthreads();
372     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
373     __syncthreads();
374     if (threadIdx.y < 1) {
375       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
376       z[row] = y[row] + shared[threadIdx.x];
377     }
378   }
379 }
380 
381 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
382 {
383   __shared__ MatScalar shared[512];
384   PetscInt             i, row, slice_id, row_in_slice;
385   /* multiple threads per row. */
386   row = blockIdx.x * blockDim.x + threadIdx.x;
387   if (row < nrows) {
388     slice_id     = row / SLICE_HEIGHT;
389     row_in_slice = row % SLICE_HEIGHT;
390 
391     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
392     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
393     __syncthreads();
394     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
395     __syncthreads();
396     if (threadIdx.y < 1) {
397       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
398       z[row] = y[row] + shared[threadIdx.x];
399     }
400   }
401 }
402 
403 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
404 {
405   __shared__ MatScalar shared[512];
406   PetscInt             i, row, slice_id, row_in_slice;
407   /* multiple threads per row. */
408   row = blockIdx.x * blockDim.x + threadIdx.x;
409   if (row < nrows) {
410     slice_id     = row / SLICE_HEIGHT;
411     row_in_slice = row % SLICE_HEIGHT;
412 
413     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
414     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
415     __syncthreads();
416     if (threadIdx.y < 1) {
417       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
418       z[row] = y[row] + shared[threadIdx.x];
419     }
420   }
421 }
422 
423 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
424 {
425   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
426   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
427   PetscScalar       *y;
428   const PetscScalar *x;
429   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
430   MatScalar         *aval;
431   PetscInt          *acolidx;
432   PetscInt          *sliidx;
433   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
434   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
435 
436   PetscFunctionBegin;
437   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
438   PetscCall(MatSeqSELLCUDACopyToGPU(A));
439   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
440   aval    = cudastruct->val;
441   acolidx = cudastruct->colidx;
442   sliidx  = cudastruct->sliidx;
443 
444   PetscCall(VecCUDAGetArrayRead(xx, &x));
445   PetscCall(VecCUDAGetArrayWrite(yy, &y));
446   PetscCall(PetscLogGpuTimeBegin());
447 
448   switch (cudastruct->kernelchoice) {
449   case 9:
450     nblocks = 1 + (nrows - 1) / sliceheight;
451     if (cudastruct->blocky == 2) {
452       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
453     } else if (cudastruct->blocky == 4) {
454       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
455     } else if (cudastruct->blocky == 8) {
456       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
457     } else if (cudastruct->blocky == 16) {
458       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
459     } else if (cudastruct->blocky == 32) {
460       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
461     } else {
462       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
463     }
464     break;
465   case 7:
466     nblocks = 1 + (nrows - 1) / (2 * sliceheight);
467     if (cudastruct->blocky == 2) {
468       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
469     } else if (cudastruct->blocky == 4) {
470       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
471     } else if (cudastruct->blocky == 8) {
472       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
473     } else if (cudastruct->blocky == 16) {
474       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
475     } else if (cudastruct->blocky == 32) {
476       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
477     } else {
478       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
479     }
480     break;
481   case 6:
482     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
483     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
484     break;
485   case 5:
486     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
487     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
488     break;
489   case 4:
490     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
491     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
492     break;
493   case 3:
494     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
495     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
496     break;
497   case 2: /* 16 slices per block if blocksize=512 */
498     nblocks = 1 + (nrows - 1) / (blocksize / 2);
499     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
500     break;
501   case 1: /* 32 slices per block if blocksize=512 */
502     nblocks = 1 + (nrows - 1) / blocksize;
503     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
504     break;
505   case 0:
506     if (sliceheight * a->maxslicewidth > 20800) { /* important threshold */
507       nblocks = 1 + (nrows - 1) / sliceheight;
508       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
509     } else {
510       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
511       if (avgslicesize <= 96) {
512         nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
513         matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
514       } else if (avgslicesize <= 432) {
515         nblocks = 1 + (nrows - 1) / sliceheight;
516         matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
517       } else if (avgslicesize <= 2400) {
518         nblocks = 1 + (nrows - 1) / sliceheight;
519         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
520       } else {
521         nblocks = 1 + (nrows - 1) / sliceheight;
522         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
523       }
524     }
525     break;
526   }
527   PetscCall(PetscLogGpuTimeEnd());
528   PetscCall(VecCUDARestoreArrayRead(xx, &x));
529   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
530   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
531   PetscFunctionReturn(PETSC_SUCCESS);
532 }
533 
534 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
535 {
536   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
537   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
538   PetscScalar       *z;
539   const PetscScalar *y, *x;
540   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
541   MatScalar         *aval    = cudastruct->val;
542   PetscInt          *acolidx = cudastruct->colidx;
543   PetscInt          *sliidx  = cudastruct->sliidx;
544 
545   PetscFunctionBegin;
546   PetscCheck(sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
547   PetscCall(MatSeqSELLCUDACopyToGPU(A));
548   if (a->nz) {
549     PetscInt nblocks, blocksize = 512;
550     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
551     PetscCall(VecCUDAGetArrayRead(xx, &x));
552     PetscCall(VecCUDAGetArrayRead(yy, &y));
553     PetscCall(VecCUDAGetArrayWrite(zz, &z));
554     PetscCall(PetscLogGpuTimeBegin());
555 
556     switch (cudastruct->kernelchoice) {
557     case 9:
558       nblocks = 1 + (nrows - 1) / sliceheight;
559       if (cudastruct->blocky == 2) {
560         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
561       } else if (cudastruct->blocky == 4) {
562         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
563       } else if (cudastruct->blocky == 8) {
564         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
565       } else if (cudastruct->blocky == 16) {
566         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
567       } else if (cudastruct->blocky == 32) {
568         matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
569       } else {
570         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
571       }
572       break;
573     case 7:
574       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
575       if (cudastruct->blocky == 2) {
576         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
577       } else if (cudastruct->blocky == 4) {
578         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
579       } else if (cudastruct->blocky == 8) {
580         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
581       } else if (cudastruct->blocky == 16) {
582         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
583       } else if (cudastruct->blocky == 32) {
584         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
585       } else {
586         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
587       }
588       break;
589     case 6:
590       nblocks = 1 + (nrows - 1) / (blocksize / 32);
591       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
592       break;
593     case 5:
594       nblocks = 1 + (nrows - 1) / (blocksize / 16);
595       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
596       break;
597     case 4:
598       nblocks = 1 + (nrows - 1) / (blocksize / 8);
599       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
600       break;
601     case 3:
602       nblocks = 1 + (nrows - 1) / (blocksize / 4);
603       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
604       break;
605     case 2:
606       nblocks = 1 + (nrows - 1) / (blocksize / 2);
607       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
608       break;
609     case 1:
610       nblocks = 1 + (nrows - 1) / blocksize;
611       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
612       break;
613     case 0:
614       if (sliceheight * a->maxslicewidth > 20800) {
615         nblocks = 1 + (nrows - 1) / sliceheight;
616         matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
617       } else {
618         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
619         if (avgslicesize <= 96) {
620           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
621           matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
622         } else if (avgslicesize <= 432) {
623           nblocks = 1 + (nrows - 1) / sliceheight;
624           matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
625         } else if (avgslicesize <= 2400) {
626           nblocks = 1 + (nrows - 1) / sliceheight;
627           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
628         } else {
629           nblocks = 1 + (nrows - 1) / sliceheight;
630           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
631         }
632       }
633       break;
634     }
635     PetscCall(PetscLogGpuTimeEnd());
636     PetscCall(VecCUDARestoreArrayRead(xx, &x));
637     PetscCall(VecCUDARestoreArrayRead(yy, &y));
638     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
639     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
640   } else {
641     PetscCall(VecCopy(yy, zz));
642   }
643   PetscFunctionReturn(PETSC_SUCCESS);
644 }
645 
646 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
647 {
648   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
649   PetscInt         kernel, blocky;
650   PetscBool        flg;
651 
652   PetscFunctionBegin;
653   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
654   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
655   if (flg) {
656     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
657     cudastruct->kernelchoice = kernel;
658   }
659   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
660   if (flg) {
661     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", kernel);
662     cudastruct->blocky = blocky;
663   }
664   PetscOptionsHeadEnd();
665   PetscFunctionReturn(PETSC_SUCCESS);
666 }
667 
668 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
669 {
670   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
671 
672   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
673   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
674   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
675   PetscFunctionReturn(PETSC_SUCCESS);
676 }
677 
678 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
679 {
680   PetscFunctionBegin;
681   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
682   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
683   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
684   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
685   A->ops->mult    = MatMult_SeqSELLCUDA;
686   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
687   PetscFunctionReturn(PETSC_SUCCESS);
688 }
689 
690 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
691 {
692   PetscFunctionBegin;
693   if (A->factortype == MAT_FACTOR_NONE) {
694     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
695   }
696   PetscCall(MatDestroy_SeqSELL(A));
697   PetscFunctionReturn(PETSC_SUCCESS);
698 }
699 
700 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
701 static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
702 {
703   PetscFunctionBegin;
704   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
705   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
706   PetscFunctionReturn(PETSC_SUCCESS);
707 }
708 
709 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
710 {
711   Mat_SeqSELLCUDA *cudastruct;
712 
713   PetscFunctionBegin;
714   PetscCall(PetscFree(B->defaultvectype));
715   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
716 
717   if (!B->spptr) {
718     if (B->factortype == MAT_FACTOR_NONE) {
719       PetscCall(PetscNew(&cudastruct));
720       B->spptr = cudastruct;
721     }
722   }
723 
724   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
725   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
726   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
727   B->ops->mult           = MatMult_SeqSELLCUDA;
728   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
729   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
730 
731   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
732   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
733 
734   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
735   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
736   PetscFunctionReturn(PETSC_SUCCESS);
737 }
738 
739 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
740 {
741   PetscFunctionBegin;
742   PetscCall(MatCreate_SeqSELL(B));
743   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
744   PetscFunctionReturn(PETSC_SUCCESS);
745 }
746