xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision 07e43b41f7cb79bba1664e5473a3e2df04f149fa)
12d1451d4SHong Zhang #include <cuda_runtime.h>
22d1451d4SHong Zhang 
32d1451d4SHong Zhang #include <petscdevice_cuda.h>
42d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
52d1451d4SHong Zhang 
6*07e43b41SHong Zhang #define SLICE_HEIGHT 16
7*07e43b41SHong Zhang 
82d1451d4SHong Zhang typedef struct {
92d1451d4SHong Zhang   PetscInt  *colidx; /* column index */
102d1451d4SHong Zhang   MatScalar *val;
112d1451d4SHong Zhang   PetscInt  *sliidx;
122d1451d4SHong Zhang   PetscInt   nonzerostate;
13*07e43b41SHong Zhang   PetscInt   kernelchoice;
142d1451d4SHong Zhang } Mat_SeqSELLCUDA;
152d1451d4SHong Zhang 
162d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
172d1451d4SHong Zhang {
182d1451d4SHong Zhang   PetscFunctionBegin;
192d1451d4SHong Zhang   if (*cudastruct) {
202d1451d4SHong Zhang     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
212d1451d4SHong Zhang     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
222d1451d4SHong Zhang     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
232d1451d4SHong Zhang     PetscCall(PetscFree(*cudastruct));
242d1451d4SHong Zhang   }
252d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
262d1451d4SHong Zhang }
272d1451d4SHong Zhang 
282d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
292d1451d4SHong Zhang {
302d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
312d1451d4SHong Zhang   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
322d1451d4SHong Zhang 
332d1451d4SHong Zhang   PetscFunctionBegin;
342d1451d4SHong Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
352d1451d4SHong Zhang     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
362d1451d4SHong Zhang     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
372d1451d4SHong Zhang       /* copy values only */
382d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
392d1451d4SHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
402d1451d4SHong Zhang     } else {
412d1451d4SHong Zhang       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
422d1451d4SHong Zhang       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
432d1451d4SHong Zhang       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
442d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
452d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
462d1451d4SHong Zhang       /* copy values, nz or maxallocmat? */
472d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
482d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
49*07e43b41SHong Zhang 
50*07e43b41SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
51*07e43b41SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
522d1451d4SHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
532d1451d4SHong Zhang       cudastruct->nonzerostate = A->nonzerostate;
542d1451d4SHong Zhang     }
552d1451d4SHong Zhang     PetscCallCUDA(WaitForCUDA());
562d1451d4SHong Zhang     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
572d1451d4SHong Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
582d1451d4SHong Zhang   }
592d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
602d1451d4SHong Zhang }
612d1451d4SHong Zhang 
622d1451d4SHong Zhang __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
632d1451d4SHong Zhang {
642d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
652d1451d4SHong Zhang   MatScalar sum;
662d1451d4SHong Zhang   /* one thread per row. */
672d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
682d1451d4SHong Zhang   if (row < nrows) {
692d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
702d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
712d1451d4SHong Zhang     sum          = 0.0;
722d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
732d1451d4SHong Zhang     y[row] = sum;
742d1451d4SHong Zhang   }
752d1451d4SHong Zhang }
762d1451d4SHong Zhang 
772d1451d4SHong Zhang __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
782d1451d4SHong Zhang {
792d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
802d1451d4SHong Zhang   MatScalar sum;
812d1451d4SHong Zhang   /* one thread per row. */
822d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
832d1451d4SHong Zhang   if (row < nrows) {
842d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
852d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
862d1451d4SHong Zhang     sum          = 0.0;
872d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
882d1451d4SHong Zhang     z[row] = y[row] + sum;
892d1451d4SHong Zhang   }
902d1451d4SHong Zhang }
91*07e43b41SHong Zhang 
92*07e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */
93*07e43b41SHong Zhang template <int BLOCKY>
94*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
95*07e43b41SHong Zhang {
96*07e43b41SHong Zhang   __shared__ MatScalar shared[SLICE_HEIGHT][BLOCKY];
97*07e43b41SHong Zhang   PetscInt             i, row, slice_id;
98*07e43b41SHong Zhang   slice_id = blockIdx.x;
99*07e43b41SHong Zhang   row      = slice_id * SLICE_HEIGHT + threadIdx.x;
100*07e43b41SHong Zhang   int tid  = threadIdx.x + threadIdx.y * SLICE_HEIGHT;
101*07e43b41SHong Zhang   /* transposed index */
102*07e43b41SHong Zhang   int         tidx = tid % BLOCKY;
103*07e43b41SHong Zhang   int         tidy = tid / BLOCKY;
104*07e43b41SHong Zhang   PetscScalar t    = 0.0;
105*07e43b41SHong Zhang   if (row < nrows) {
106*07e43b41SHong Zhang     for (i = sliidx[slice_id] + threadIdx.x + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * BLOCKY) t += aval[i] * x[acolidx[i]];
1072d1451d4SHong Zhang   }
108*07e43b41SHong Zhang   /* transpose layout to reduce each row using warp shfl */
109*07e43b41SHong Zhang   shared[threadIdx.x][threadIdx.y] = t;
110*07e43b41SHong Zhang   __syncthreads();
111*07e43b41SHong Zhang   t = shared[tidy][tidx];
112*07e43b41SHong Zhang #pragma unroll
113*07e43b41SHong Zhang   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
114*07e43b41SHong Zhang   __syncthreads();
115*07e43b41SHong Zhang   if (tidx == 0) { shared[0][tidy] = t; }
116*07e43b41SHong Zhang   __syncthreads();
117*07e43b41SHong Zhang   if (row < nrows && threadIdx.y == 0) { y[row] = shared[0][threadIdx.x]; }
1182d1451d4SHong Zhang }
1192d1451d4SHong Zhang 
120*07e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */
121*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
1222d1451d4SHong Zhang {
123*07e43b41SHong Zhang   PetscInt i, row, slice_id;
124*07e43b41SHong Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
125*07e43b41SHong Zhang   row      = slice_id * SLICE_HEIGHT + threadIdx.x % SLICE_HEIGHT;
126*07e43b41SHong Zhang   double t = 0.0;
127*07e43b41SHong Zhang   if (row < nrows) {
128*07e43b41SHong Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
129*07e43b41SHong Zhang   }
130*07e43b41SHong Zhang   t += __shfl_down_sync(0xffffffff, t, 16);
131*07e43b41SHong Zhang   if (row < nrows && threadIdx.x < 16) { y[row] = t; }
132*07e43b41SHong Zhang }
133*07e43b41SHong Zhang 
134*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
135*07e43b41SHong Zhang {
136*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
1372d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
138*07e43b41SHong Zhang   /* multiple threads per row. */
1392d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
1402d1451d4SHong Zhang   if (row < nrows) {
1412d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
1422d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
1432d1451d4SHong Zhang 
1442d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
1452d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
146*07e43b41SHong Zhang     __syncthreads();
147*07e43b41SHong Zhang     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
148*07e43b41SHong Zhang     __syncthreads();
149*07e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
1502d1451d4SHong Zhang     __syncthreads();
1512d1451d4SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
1522d1451d4SHong Zhang     __syncthreads();
1532d1451d4SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
1542d1451d4SHong Zhang     __syncthreads();
1552d1451d4SHong Zhang     if (threadIdx.y < 1) {
156*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
1572d1451d4SHong Zhang       y[row] = shared[threadIdx.x];
1582d1451d4SHong Zhang     }
1592d1451d4SHong Zhang   }
1602d1451d4SHong Zhang }
1612d1451d4SHong Zhang 
162*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
1632d1451d4SHong Zhang {
164*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
1652d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
166*07e43b41SHong Zhang   /* multiple threads per row. */
1672d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
1682d1451d4SHong Zhang   if (row < nrows) {
1692d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
1702d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
1712d1451d4SHong Zhang 
1722d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
1732d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
174*07e43b41SHong Zhang     __syncthreads();
175*07e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
1762d1451d4SHong Zhang     __syncthreads();
1772d1451d4SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
1782d1451d4SHong Zhang     __syncthreads();
1792d1451d4SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
1802d1451d4SHong Zhang     __syncthreads();
1812d1451d4SHong Zhang     if (threadIdx.y < 1) {
182*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
183*07e43b41SHong Zhang       y[row] = shared[threadIdx.x];
184*07e43b41SHong Zhang     }
185*07e43b41SHong Zhang   }
186*07e43b41SHong Zhang }
187*07e43b41SHong Zhang 
188*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
189*07e43b41SHong Zhang {
190*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
191*07e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
192*07e43b41SHong Zhang   /* multiple threads per row. */
193*07e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
194*07e43b41SHong Zhang   if (row < nrows) {
195*07e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
196*07e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
197*07e43b41SHong Zhang 
198*07e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
199*07e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
200*07e43b41SHong Zhang     __syncthreads();
201*07e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
202*07e43b41SHong Zhang     __syncthreads();
203*07e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
204*07e43b41SHong Zhang     __syncthreads();
205*07e43b41SHong Zhang     if (threadIdx.y < 1) {
206*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
207*07e43b41SHong Zhang       y[row] = shared[threadIdx.x];
208*07e43b41SHong Zhang     }
209*07e43b41SHong Zhang   }
210*07e43b41SHong Zhang }
211*07e43b41SHong Zhang 
212*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
213*07e43b41SHong Zhang {
214*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
215*07e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
216*07e43b41SHong Zhang   /* multiple threads per row. */
217*07e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
218*07e43b41SHong Zhang   if (row < nrows) {
219*07e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
220*07e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
221*07e43b41SHong Zhang 
222*07e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
223*07e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
224*07e43b41SHong Zhang     __syncthreads();
225*07e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
226*07e43b41SHong Zhang     __syncthreads();
227*07e43b41SHong Zhang     if (threadIdx.y < 1) {
228*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
229*07e43b41SHong Zhang       y[row] = shared[threadIdx.x];
230*07e43b41SHong Zhang     }
231*07e43b41SHong Zhang   }
232*07e43b41SHong Zhang }
233*07e43b41SHong Zhang 
234*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
235*07e43b41SHong Zhang {
236*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
237*07e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
238*07e43b41SHong Zhang   /* multiple threads per row. */
239*07e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
240*07e43b41SHong Zhang   if (row < nrows) {
241*07e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
242*07e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
243*07e43b41SHong Zhang 
244*07e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
245*07e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
246*07e43b41SHong Zhang     __syncthreads();
247*07e43b41SHong Zhang     if (threadIdx.y < 1) {
248*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
249*07e43b41SHong Zhang       y[row] = shared[threadIdx.x];
250*07e43b41SHong Zhang     }
251*07e43b41SHong Zhang   }
252*07e43b41SHong Zhang }
253*07e43b41SHong Zhang 
254*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
255*07e43b41SHong Zhang {
256*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
257*07e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
258*07e43b41SHong Zhang   /* multiple threads per row. */
259*07e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
260*07e43b41SHong Zhang   if (row < nrows) {
261*07e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
262*07e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
263*07e43b41SHong Zhang 
264*07e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
265*07e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
266*07e43b41SHong Zhang     __syncthreads();
267*07e43b41SHong Zhang     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
268*07e43b41SHong Zhang     __syncthreads();
269*07e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
270*07e43b41SHong Zhang     __syncthreads();
271*07e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
272*07e43b41SHong Zhang     __syncthreads();
273*07e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
274*07e43b41SHong Zhang     __syncthreads();
275*07e43b41SHong Zhang     if (threadIdx.y < 1) {
276*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
2772d1451d4SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
2782d1451d4SHong Zhang     }
2792d1451d4SHong Zhang   }
2802d1451d4SHong Zhang }
281*07e43b41SHong Zhang 
282*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
283*07e43b41SHong Zhang {
284*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
285*07e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
286*07e43b41SHong Zhang   /* multiple threads per row. */
287*07e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
288*07e43b41SHong Zhang   if (row < nrows) {
289*07e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
290*07e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
291*07e43b41SHong Zhang 
292*07e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
293*07e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
294*07e43b41SHong Zhang     __syncthreads();
295*07e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
296*07e43b41SHong Zhang     __syncthreads();
297*07e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
298*07e43b41SHong Zhang     __syncthreads();
299*07e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
300*07e43b41SHong Zhang     __syncthreads();
301*07e43b41SHong Zhang     if (threadIdx.y < 1) {
302*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
303*07e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
304*07e43b41SHong Zhang     }
305*07e43b41SHong Zhang   }
306*07e43b41SHong Zhang }
307*07e43b41SHong Zhang 
308*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
309*07e43b41SHong Zhang {
310*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
311*07e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
312*07e43b41SHong Zhang   /* multiple threads per row. */
313*07e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
314*07e43b41SHong Zhang   if (row < nrows) {
315*07e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
316*07e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
317*07e43b41SHong Zhang 
318*07e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
319*07e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
320*07e43b41SHong Zhang     __syncthreads();
321*07e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
322*07e43b41SHong Zhang     __syncthreads();
323*07e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
324*07e43b41SHong Zhang     __syncthreads();
325*07e43b41SHong Zhang     if (threadIdx.y < 1) {
326*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
327*07e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
328*07e43b41SHong Zhang     }
329*07e43b41SHong Zhang   }
330*07e43b41SHong Zhang }
331*07e43b41SHong Zhang 
332*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
333*07e43b41SHong Zhang {
334*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
335*07e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
336*07e43b41SHong Zhang   /* multiple threads per row. */
337*07e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
338*07e43b41SHong Zhang   if (row < nrows) {
339*07e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
340*07e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
341*07e43b41SHong Zhang 
342*07e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
343*07e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
344*07e43b41SHong Zhang     __syncthreads();
345*07e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
346*07e43b41SHong Zhang     __syncthreads();
347*07e43b41SHong Zhang     if (threadIdx.y < 1) {
348*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
349*07e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
350*07e43b41SHong Zhang     }
351*07e43b41SHong Zhang   }
352*07e43b41SHong Zhang }
353*07e43b41SHong Zhang 
354*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
355*07e43b41SHong Zhang {
356*07e43b41SHong Zhang   __shared__ MatScalar shared[512];
357*07e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
358*07e43b41SHong Zhang   /* multiple threads per row. */
359*07e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
360*07e43b41SHong Zhang   if (row < nrows) {
361*07e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
362*07e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
363*07e43b41SHong Zhang 
364*07e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
365*07e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
366*07e43b41SHong Zhang     __syncthreads();
367*07e43b41SHong Zhang     if (threadIdx.y < 1) {
368*07e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
369*07e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
370*07e43b41SHong Zhang     }
371*07e43b41SHong Zhang   }
3722d1451d4SHong Zhang }
3732d1451d4SHong Zhang 
3742d1451d4SHong Zhang PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
3752d1451d4SHong Zhang {
3762d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
3772d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
3782d1451d4SHong Zhang   PetscScalar       *y;
3792d1451d4SHong Zhang   const PetscScalar *x;
3802d1451d4SHong Zhang   PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
3812d1451d4SHong Zhang   MatScalar         *aval;
3822d1451d4SHong Zhang   PetscInt          *acolidx;
3832d1451d4SHong Zhang   PetscInt          *sliidx;
384*07e43b41SHong Zhang   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
385*07e43b41SHong Zhang   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
386*07e43b41SHong Zhang   dim3               block_k8(32, SLICE_HEIGHT);
3872d1451d4SHong Zhang 
3882d1451d4SHong Zhang   PetscFunctionBegin;
389*07e43b41SHong Zhang   PetscCheck(a->sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
3902d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
3912d1451d4SHong Zhang   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
3922d1451d4SHong Zhang   aval    = cudastruct->val;
3932d1451d4SHong Zhang   acolidx = cudastruct->colidx;
3942d1451d4SHong Zhang   sliidx  = cudastruct->sliidx;
3952d1451d4SHong Zhang 
3962d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayRead(xx, &x));
3972d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayWrite(yy, &y));
3982d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeBegin());
399*07e43b41SHong Zhang 
400*07e43b41SHong Zhang   switch (cudastruct->kernelchoice) {
401*07e43b41SHong Zhang   case 9:
402*07e43b41SHong Zhang     if (a->maxslicewidth > 512) {
403*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
404*07e43b41SHong Zhang       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
405*07e43b41SHong Zhang     } else if (a->avgslicewidth < 8) {
406*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
407*07e43b41SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
408*07e43b41SHong Zhang     } else if (a->avgslicewidth < 16) {
409*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
410*07e43b41SHong Zhang       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(SLICE_HEIGHT, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
411*07e43b41SHong Zhang     } else if (a->avgslicewidth < 32) {
412*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
413*07e43b41SHong Zhang       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(SLICE_HEIGHT, 8)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
414*07e43b41SHong Zhang     } else if (a->avgslicewidth < 64) {
415*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
416*07e43b41SHong Zhang       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(SLICE_HEIGHT, 16)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
417*07e43b41SHong Zhang     } else {
418*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
419*07e43b41SHong Zhang       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
420*07e43b41SHong Zhang     }
421*07e43b41SHong Zhang     break;
422*07e43b41SHong Zhang   case 8:
423*07e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
424*07e43b41SHong Zhang     matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
425*07e43b41SHong Zhang     break;
426*07e43b41SHong Zhang   case 7:
427*07e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
428*07e43b41SHong Zhang     matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
429*07e43b41SHong Zhang     break;
430*07e43b41SHong Zhang   case 6:
431*07e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
432*07e43b41SHong Zhang     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
433*07e43b41SHong Zhang     break;
434*07e43b41SHong Zhang   case 5:
435*07e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
436*07e43b41SHong Zhang     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
437*07e43b41SHong Zhang     break;
438*07e43b41SHong Zhang   case 4:
439*07e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
440*07e43b41SHong Zhang     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
441*07e43b41SHong Zhang     break;
442*07e43b41SHong Zhang   case 3:
443*07e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
444*07e43b41SHong Zhang     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
445*07e43b41SHong Zhang     break;
446*07e43b41SHong Zhang   case 2: /* 16 slices per block if blocksize=512 */
447*07e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 2);
448*07e43b41SHong Zhang     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
449*07e43b41SHong Zhang     break;
450*07e43b41SHong Zhang   case 1: /* 32 slices per block if blocksize=512 */
451*07e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / blocksize;
4522d1451d4SHong Zhang     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
453*07e43b41SHong Zhang     break;
454*07e43b41SHong Zhang   case 0:
455*07e43b41SHong Zhang     if (a->maxslicewidth > 64) {
456*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
457*07e43b41SHong Zhang       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
4582d1451d4SHong Zhang     } else {
459*07e43b41SHong Zhang       if (a->avgslicewidth < 8) {
460*07e43b41SHong Zhang         nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
461*07e43b41SHong Zhang         matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
462*07e43b41SHong Zhang       } else if (a->avgslicewidth < 16) {
463*07e43b41SHong Zhang         nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
464*07e43b41SHong Zhang         matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(SLICE_HEIGHT, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
465*07e43b41SHong Zhang       } else if (a->avgslicewidth < 32) {
466*07e43b41SHong Zhang         nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
467*07e43b41SHong Zhang         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(SLICE_HEIGHT, 8)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
4682d1451d4SHong Zhang       } else {
469*07e43b41SHong Zhang         nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT);
470*07e43b41SHong Zhang         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(SLICE_HEIGHT, 16)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
4712d1451d4SHong Zhang       }
4722d1451d4SHong Zhang     }
473*07e43b41SHong Zhang     break;
474*07e43b41SHong Zhang   }
4752d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeEnd());
4762d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayRead(xx, &x));
4772d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
4782d1451d4SHong Zhang   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
4792d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
4802d1451d4SHong Zhang }
4812d1451d4SHong Zhang 
4822d1451d4SHong Zhang PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
4832d1451d4SHong Zhang {
4842d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
4852d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
4862d1451d4SHong Zhang   PetscScalar       *z;
4872d1451d4SHong Zhang   const PetscScalar *y, *x;
4882d1451d4SHong Zhang   PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
4892d1451d4SHong Zhang   MatScalar         *aval    = cudastruct->val;
4902d1451d4SHong Zhang   PetscInt          *acolidx = cudastruct->colidx;
4912d1451d4SHong Zhang   PetscInt          *sliidx  = cudastruct->sliidx;
4922d1451d4SHong Zhang 
4932d1451d4SHong Zhang   PetscFunctionBegin;
494*07e43b41SHong Zhang   PetscCheck(a->sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
4952d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
4962d1451d4SHong Zhang   if (a->nz) {
497*07e43b41SHong Zhang     PetscInt nblocks, blocksize = 512;
498*07e43b41SHong Zhang     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
4992d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(xx, &x));
5002d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(yy, &y));
5012d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayWrite(zz, &z));
5022d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeBegin());
503*07e43b41SHong Zhang 
504*07e43b41SHong Zhang     switch (cudastruct->kernelchoice) {
505*07e43b41SHong Zhang     case 6:
506*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 32);
507*07e43b41SHong Zhang       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
508*07e43b41SHong Zhang       break;
509*07e43b41SHong Zhang     case 5:
510*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 16);
511*07e43b41SHong Zhang       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
512*07e43b41SHong Zhang       break;
513*07e43b41SHong Zhang     case 4:
514*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 8);
515*07e43b41SHong Zhang       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
516*07e43b41SHong Zhang       break;
517*07e43b41SHong Zhang     case 3:
518*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 4);
519*07e43b41SHong Zhang       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
520*07e43b41SHong Zhang       break;
521*07e43b41SHong Zhang     case 2:
522*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 2);
523*07e43b41SHong Zhang       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
524*07e43b41SHong Zhang       break;
525*07e43b41SHong Zhang     case 1:
526*07e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / blocksize;
5272d1451d4SHong Zhang       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
528*07e43b41SHong Zhang       break;
529*07e43b41SHong Zhang     case 0: /* TODO */
530*07e43b41SHong Zhang       break;
5312d1451d4SHong Zhang     }
5322d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeEnd());
5332d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(xx, &x));
5342d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(yy, &y));
5352d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
5362d1451d4SHong Zhang     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
5372d1451d4SHong Zhang   } else {
5382d1451d4SHong Zhang     PetscCall(VecCopy(yy, zz));
5392d1451d4SHong Zhang   }
5402d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
5412d1451d4SHong Zhang }
5422d1451d4SHong Zhang 
5432d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
5442d1451d4SHong Zhang {
545*07e43b41SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
546*07e43b41SHong Zhang   PetscInt         kernel;
547*07e43b41SHong Zhang   PetscBool        flg;
548*07e43b41SHong Zhang 
5492d1451d4SHong Zhang   PetscFunctionBegin;
5502d1451d4SHong Zhang   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
551*07e43b41SHong Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
552*07e43b41SHong Zhang   if (flg) {
553*07e43b41SHong Zhang     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
554*07e43b41SHong Zhang     cudastruct->kernelchoice = kernel;
555*07e43b41SHong Zhang   }
5562d1451d4SHong Zhang   PetscOptionsHeadEnd();
5572d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
5582d1451d4SHong Zhang }
5592d1451d4SHong Zhang 
560*07e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
561*07e43b41SHong Zhang {
562*07e43b41SHong Zhang   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
563*07e43b41SHong Zhang 
564*07e43b41SHong Zhang   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
565*07e43b41SHong Zhang   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
566*07e43b41SHong Zhang   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
567*07e43b41SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
568*07e43b41SHong Zhang }
569*07e43b41SHong Zhang 
5702d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
5712d1451d4SHong Zhang {
5722d1451d4SHong Zhang   PetscFunctionBegin;
5732d1451d4SHong Zhang   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
574*07e43b41SHong Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
5752d1451d4SHong Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
5762d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
5772d1451d4SHong Zhang   A->ops->mult    = MatMult_SeqSELLCUDA;
5782d1451d4SHong Zhang   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
5792d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
5802d1451d4SHong Zhang }
5812d1451d4SHong Zhang 
5822d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
5832d1451d4SHong Zhang {
5842d1451d4SHong Zhang   PetscFunctionBegin;
5852d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) {
5862d1451d4SHong Zhang     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
5872d1451d4SHong Zhang   }
5882d1451d4SHong Zhang   PetscCall(MatDestroy_SeqSELL(A));
5892d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
5902d1451d4SHong Zhang }
5912d1451d4SHong Zhang 
5922d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
5932d1451d4SHong Zhang static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
5942d1451d4SHong Zhang {
5952d1451d4SHong Zhang   PetscFunctionBegin;
5962d1451d4SHong Zhang   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
5972d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
5982d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
5992d1451d4SHong Zhang }
6002d1451d4SHong Zhang 
6012d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
6022d1451d4SHong Zhang {
6032d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct;
6042d1451d4SHong Zhang 
6052d1451d4SHong Zhang   PetscFunctionBegin;
6062d1451d4SHong Zhang   PetscCall(PetscFree(B->defaultvectype));
6072d1451d4SHong Zhang   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
6082d1451d4SHong Zhang 
6092d1451d4SHong Zhang   if (!B->spptr) {
6102d1451d4SHong Zhang     if (B->factortype == MAT_FACTOR_NONE) {
6112d1451d4SHong Zhang       PetscCall(PetscNew(&cudastruct));
6122d1451d4SHong Zhang       B->spptr = cudastruct;
6132d1451d4SHong Zhang     }
6142d1451d4SHong Zhang   }
6152d1451d4SHong Zhang 
6162d1451d4SHong Zhang   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
6172d1451d4SHong Zhang   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
6182d1451d4SHong Zhang   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
6192d1451d4SHong Zhang   B->ops->mult           = MatMult_SeqSELLCUDA;
6202d1451d4SHong Zhang   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
6212d1451d4SHong Zhang   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
6222d1451d4SHong Zhang 
623*07e43b41SHong Zhang   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
624*07e43b41SHong Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
625*07e43b41SHong Zhang 
6262d1451d4SHong Zhang   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
6272d1451d4SHong Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
6282d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
6292d1451d4SHong Zhang }
6302d1451d4SHong Zhang 
6312d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
6322d1451d4SHong Zhang {
6332d1451d4SHong Zhang   PetscFunctionBegin;
6342d1451d4SHong Zhang   PetscCall(MatCreate_SeqSELL(B));
6352d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
6362d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
6372d1451d4SHong Zhang }
638