xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision cca9ff8b0ed15f9f7ff3d370e0ced223f69d26ef)
12d1451d4SHong Zhang #include <cuda_runtime.h>
22d1451d4SHong Zhang 
32d1451d4SHong Zhang #include <petscdevice_cuda.h>
42d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
52d1451d4SHong Zhang 
607e43b41SHong Zhang #define SLICE_HEIGHT 16
707e43b41SHong Zhang 
82d1451d4SHong Zhang typedef struct {
92d1451d4SHong Zhang   PetscInt  *colidx; /* column index */
102d1451d4SHong Zhang   MatScalar *val;
112d1451d4SHong Zhang   PetscInt  *sliidx;
122d1451d4SHong Zhang   PetscInt   nonzerostate;
1307e43b41SHong Zhang   PetscInt   kernelchoice;
144e58db63SHong Zhang   PetscInt   blocky;
152d1451d4SHong Zhang } Mat_SeqSELLCUDA;
162d1451d4SHong Zhang 
172d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
182d1451d4SHong Zhang {
192d1451d4SHong Zhang   PetscFunctionBegin;
202d1451d4SHong Zhang   if (*cudastruct) {
212d1451d4SHong Zhang     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
222d1451d4SHong Zhang     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
232d1451d4SHong Zhang     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
242d1451d4SHong Zhang     PetscCall(PetscFree(*cudastruct));
252d1451d4SHong Zhang   }
262d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
272d1451d4SHong Zhang }
282d1451d4SHong Zhang 
292d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
302d1451d4SHong Zhang {
312d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
322d1451d4SHong Zhang   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
332d1451d4SHong Zhang 
342d1451d4SHong Zhang   PetscFunctionBegin;
352d1451d4SHong Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
362d1451d4SHong Zhang     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
372d1451d4SHong Zhang     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
382d1451d4SHong Zhang       /* copy values only */
392d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
402d1451d4SHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
412d1451d4SHong Zhang     } else {
422d1451d4SHong Zhang       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
432d1451d4SHong Zhang       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
442d1451d4SHong Zhang       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
452d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
462d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
472d1451d4SHong Zhang       /* copy values, nz or maxallocmat? */
482d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
492d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
5007e43b41SHong Zhang 
5107e43b41SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
5207e43b41SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
532d1451d4SHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
542d1451d4SHong Zhang       cudastruct->nonzerostate = A->nonzerostate;
552d1451d4SHong Zhang     }
562d1451d4SHong Zhang     PetscCallCUDA(WaitForCUDA());
572d1451d4SHong Zhang     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
582d1451d4SHong Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
592d1451d4SHong Zhang   }
602d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
612d1451d4SHong Zhang }
622d1451d4SHong Zhang 
634e58db63SHong Zhang __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
642d1451d4SHong Zhang {
652d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
662d1451d4SHong Zhang   MatScalar sum;
672d1451d4SHong Zhang   /* one thread per row. */
682d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
692d1451d4SHong Zhang   if (row < nrows) {
704e58db63SHong Zhang     slice_id     = row / sliceheight;
714e58db63SHong Zhang     row_in_slice = row % sliceheight;
722d1451d4SHong Zhang     sum          = 0.0;
734e58db63SHong Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
742d1451d4SHong Zhang     y[row] = sum;
752d1451d4SHong Zhang   }
762d1451d4SHong Zhang }
772d1451d4SHong Zhang 
784e58db63SHong Zhang __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
792d1451d4SHong Zhang {
802d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
812d1451d4SHong Zhang   MatScalar sum;
822d1451d4SHong Zhang   /* one thread per row. */
832d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
842d1451d4SHong Zhang   if (row < nrows) {
854e58db63SHong Zhang     slice_id     = row / sliceheight;
864e58db63SHong Zhang     row_in_slice = row % sliceheight;
872d1451d4SHong Zhang     sum          = 0.0;
884e58db63SHong Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
892d1451d4SHong Zhang     z[row] = y[row] + sum;
902d1451d4SHong Zhang   }
912d1451d4SHong Zhang }
9207e43b41SHong Zhang 
9307e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */
9407e43b41SHong Zhang template <int BLOCKY>
954e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
9607e43b41SHong Zhang {
974e58db63SHong Zhang   __shared__ MatScalar shared[32][BLOCKY];
984e58db63SHong Zhang   PetscInt             i, row, slice_id = blockIdx.x;
994e58db63SHong Zhang   int                  tid = threadIdx.x + threadIdx.y * 32;
10007e43b41SHong Zhang   /* transposed index */
10107e43b41SHong Zhang   int         tidx = tid % BLOCKY;
10207e43b41SHong Zhang   int         tidy = tid / BLOCKY;
10307e43b41SHong Zhang   PetscScalar t    = 0.0;
1044e58db63SHong Zhang 
1054e58db63SHong Zhang   row = slice_id * sliceheight + threadIdx.x % sliceheight;
10607e43b41SHong Zhang   if (row < nrows) {
1074e58db63SHong Zhang     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
1082d1451d4SHong Zhang   }
1094e58db63SHong Zhang #pragma unroll
1104e58db63SHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
11107e43b41SHong Zhang   /* transpose layout to reduce each row using warp shfl */
11207e43b41SHong Zhang   shared[threadIdx.x][threadIdx.y] = t;
11307e43b41SHong Zhang   __syncthreads();
11407e43b41SHong Zhang   t = shared[tidy][tidx];
11507e43b41SHong Zhang #pragma unroll
11607e43b41SHong Zhang   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
1174e58db63SHong Zhang   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
11807e43b41SHong Zhang   __syncthreads();
1194e58db63SHong Zhang   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
1202d1451d4SHong Zhang }
1212d1451d4SHong Zhang 
122*cca9ff8bSHong Zhang /* use 1 block per slice, suitable for large slice width */
123*cca9ff8bSHong Zhang template <int BLOCKY>
124*cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
125*cca9ff8bSHong Zhang {
126*cca9ff8bSHong Zhang   __shared__ MatScalar shared[32][BLOCKY];
127*cca9ff8bSHong Zhang   PetscInt             i, row, slice_id = blockIdx.x;
128*cca9ff8bSHong Zhang   int                  tid = threadIdx.x + threadIdx.y * 32;
129*cca9ff8bSHong Zhang   /* transposed index */
130*cca9ff8bSHong Zhang   int         tidx = tid % BLOCKY;
131*cca9ff8bSHong Zhang   int         tidy = tid / BLOCKY;
132*cca9ff8bSHong Zhang   PetscScalar t    = 0.0;
133*cca9ff8bSHong Zhang 
134*cca9ff8bSHong Zhang   row = slice_id * sliceheight + threadIdx.x % sliceheight;
135*cca9ff8bSHong Zhang   if (row < nrows) {
136*cca9ff8bSHong Zhang     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
137*cca9ff8bSHong Zhang   }
138*cca9ff8bSHong Zhang #pragma unroll
139*cca9ff8bSHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
140*cca9ff8bSHong Zhang   /* transpose layout to reduce each row using warp shfl */
141*cca9ff8bSHong Zhang   shared[threadIdx.x][threadIdx.y] = t;
142*cca9ff8bSHong Zhang   __syncthreads();
143*cca9ff8bSHong Zhang   t = shared[tidy][tidx];
144*cca9ff8bSHong Zhang #pragma unroll
145*cca9ff8bSHong Zhang   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
146*cca9ff8bSHong Zhang   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
147*cca9ff8bSHong Zhang   __syncthreads();
148*cca9ff8bSHong Zhang   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
149*cca9ff8bSHong Zhang }
150*cca9ff8bSHong Zhang 
15107e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */
1524e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
1532d1451d4SHong Zhang {
15407e43b41SHong Zhang   PetscInt i, row, slice_id;
15507e43b41SHong Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
1564e58db63SHong Zhang   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
15707e43b41SHong Zhang   double t = 0.0;
15807e43b41SHong Zhang   if (row < nrows) {
15907e43b41SHong Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
16007e43b41SHong Zhang   }
1614e58db63SHong Zhang #pragma unroll
1624e58db63SHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
1634e58db63SHong Zhang   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
16407e43b41SHong Zhang }
16507e43b41SHong Zhang 
166*cca9ff8bSHong Zhang /* use 1 warp per slice, suitable for small slice width */
167*cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
168*cca9ff8bSHong Zhang {
169*cca9ff8bSHong Zhang   PetscInt i, row, slice_id;
170*cca9ff8bSHong Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
171*cca9ff8bSHong Zhang   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
172*cca9ff8bSHong Zhang   double t = 0.0;
173*cca9ff8bSHong Zhang   if (row < nrows) {
174*cca9ff8bSHong Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
175*cca9ff8bSHong Zhang   }
176*cca9ff8bSHong Zhang #pragma unroll
177*cca9ff8bSHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
178*cca9ff8bSHong Zhang   if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
179*cca9ff8bSHong Zhang }
180*cca9ff8bSHong Zhang 
181a9dd396cSHong Zhang /***********  Kernel 2-6  are tied to slice height 16. They are kept only for performance comparison  **********/
182a9dd396cSHong Zhang 
183a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
18407e43b41SHong Zhang {
18507e43b41SHong Zhang   __shared__ MatScalar shared[512];
1862d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
18707e43b41SHong Zhang   /* multiple threads per row. */
1882d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
1892d1451d4SHong Zhang   if (row < nrows) {
1902d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
1912d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
1922d1451d4SHong Zhang 
1932d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
1942d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
19507e43b41SHong Zhang     __syncthreads();
19607e43b41SHong Zhang     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
19707e43b41SHong Zhang     __syncthreads();
19807e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
1992d1451d4SHong Zhang     __syncthreads();
2002d1451d4SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
2012d1451d4SHong Zhang     __syncthreads();
2022d1451d4SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
2032d1451d4SHong Zhang     __syncthreads();
2042d1451d4SHong Zhang     if (threadIdx.y < 1) {
20507e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
2062d1451d4SHong Zhang       y[row] = shared[threadIdx.x];
2072d1451d4SHong Zhang     }
2082d1451d4SHong Zhang   }
2092d1451d4SHong Zhang }
2102d1451d4SHong Zhang 
211a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
2122d1451d4SHong Zhang {
21307e43b41SHong Zhang   __shared__ MatScalar shared[512];
2142d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
21507e43b41SHong Zhang   /* multiple threads per row. */
2162d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
2172d1451d4SHong Zhang   if (row < nrows) {
2182d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
2192d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
2202d1451d4SHong Zhang 
2212d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
2222d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
22307e43b41SHong Zhang     __syncthreads();
22407e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
2252d1451d4SHong Zhang     __syncthreads();
2262d1451d4SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
2272d1451d4SHong Zhang     __syncthreads();
2282d1451d4SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
2292d1451d4SHong Zhang     __syncthreads();
2302d1451d4SHong Zhang     if (threadIdx.y < 1) {
23107e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
23207e43b41SHong Zhang       y[row] = shared[threadIdx.x];
23307e43b41SHong Zhang     }
23407e43b41SHong Zhang   }
23507e43b41SHong Zhang }
23607e43b41SHong Zhang 
237a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
23807e43b41SHong Zhang {
23907e43b41SHong Zhang   __shared__ MatScalar shared[512];
24007e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
24107e43b41SHong Zhang   /* multiple threads per row. */
24207e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
24307e43b41SHong Zhang   if (row < nrows) {
24407e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
24507e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
24607e43b41SHong Zhang 
24707e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
24807e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
24907e43b41SHong Zhang     __syncthreads();
25007e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
25107e43b41SHong Zhang     __syncthreads();
25207e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
25307e43b41SHong Zhang     __syncthreads();
25407e43b41SHong Zhang     if (threadIdx.y < 1) {
25507e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
25607e43b41SHong Zhang       y[row] = shared[threadIdx.x];
25707e43b41SHong Zhang     }
25807e43b41SHong Zhang   }
25907e43b41SHong Zhang }
26007e43b41SHong Zhang 
261a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
26207e43b41SHong Zhang {
26307e43b41SHong Zhang   __shared__ MatScalar shared[512];
26407e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
26507e43b41SHong Zhang   /* multiple threads per row. */
26607e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
26707e43b41SHong Zhang   if (row < nrows) {
26807e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
26907e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
27007e43b41SHong Zhang 
27107e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
27207e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
27307e43b41SHong Zhang     __syncthreads();
27407e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
27507e43b41SHong Zhang     __syncthreads();
27607e43b41SHong Zhang     if (threadIdx.y < 1) {
27707e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
27807e43b41SHong Zhang       y[row] = shared[threadIdx.x];
27907e43b41SHong Zhang     }
28007e43b41SHong Zhang   }
28107e43b41SHong Zhang }
28207e43b41SHong Zhang 
283a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
28407e43b41SHong Zhang {
28507e43b41SHong Zhang   __shared__ MatScalar shared[512];
28607e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
28707e43b41SHong Zhang   /* multiple threads per row. */
28807e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
28907e43b41SHong Zhang   if (row < nrows) {
29007e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
29107e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
29207e43b41SHong Zhang 
29307e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
29407e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
29507e43b41SHong Zhang     __syncthreads();
29607e43b41SHong Zhang     if (threadIdx.y < 1) {
29707e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
29807e43b41SHong Zhang       y[row] = shared[threadIdx.x];
29907e43b41SHong Zhang     }
30007e43b41SHong Zhang   }
30107e43b41SHong Zhang }
30207e43b41SHong Zhang 
303a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
30407e43b41SHong Zhang {
30507e43b41SHong Zhang   __shared__ MatScalar shared[512];
30607e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
30707e43b41SHong Zhang   /* multiple threads per row. */
30807e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
30907e43b41SHong Zhang   if (row < nrows) {
31007e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
31107e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
31207e43b41SHong Zhang 
31307e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
31407e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
31507e43b41SHong Zhang     __syncthreads();
31607e43b41SHong Zhang     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
31707e43b41SHong Zhang     __syncthreads();
31807e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
31907e43b41SHong Zhang     __syncthreads();
32007e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
32107e43b41SHong Zhang     __syncthreads();
32207e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
32307e43b41SHong Zhang     __syncthreads();
32407e43b41SHong Zhang     if (threadIdx.y < 1) {
32507e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
3262d1451d4SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
3272d1451d4SHong Zhang     }
3282d1451d4SHong Zhang   }
3292d1451d4SHong Zhang }
33007e43b41SHong Zhang 
331a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
33207e43b41SHong Zhang {
33307e43b41SHong Zhang   __shared__ MatScalar shared[512];
33407e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
33507e43b41SHong Zhang   /* multiple threads per row. */
33607e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
33707e43b41SHong Zhang   if (row < nrows) {
33807e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
33907e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
34007e43b41SHong Zhang 
34107e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
34207e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
34307e43b41SHong Zhang     __syncthreads();
34407e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
34507e43b41SHong Zhang     __syncthreads();
34607e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
34707e43b41SHong Zhang     __syncthreads();
34807e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
34907e43b41SHong Zhang     __syncthreads();
35007e43b41SHong Zhang     if (threadIdx.y < 1) {
35107e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
35207e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
35307e43b41SHong Zhang     }
35407e43b41SHong Zhang   }
35507e43b41SHong Zhang }
35607e43b41SHong Zhang 
357a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
35807e43b41SHong Zhang {
35907e43b41SHong Zhang   __shared__ MatScalar shared[512];
36007e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
36107e43b41SHong Zhang   /* multiple threads per row. */
36207e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
36307e43b41SHong Zhang   if (row < nrows) {
36407e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
36507e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
36607e43b41SHong Zhang 
36707e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
36807e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
36907e43b41SHong Zhang     __syncthreads();
37007e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
37107e43b41SHong Zhang     __syncthreads();
37207e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
37307e43b41SHong Zhang     __syncthreads();
37407e43b41SHong Zhang     if (threadIdx.y < 1) {
37507e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
37607e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
37707e43b41SHong Zhang     }
37807e43b41SHong Zhang   }
37907e43b41SHong Zhang }
38007e43b41SHong Zhang 
381a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
38207e43b41SHong Zhang {
38307e43b41SHong Zhang   __shared__ MatScalar shared[512];
38407e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
38507e43b41SHong Zhang   /* multiple threads per row. */
38607e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
38707e43b41SHong Zhang   if (row < nrows) {
38807e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
38907e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
39007e43b41SHong Zhang 
39107e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
39207e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
39307e43b41SHong Zhang     __syncthreads();
39407e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
39507e43b41SHong Zhang     __syncthreads();
39607e43b41SHong Zhang     if (threadIdx.y < 1) {
39707e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
39807e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
39907e43b41SHong Zhang     }
40007e43b41SHong Zhang   }
40107e43b41SHong Zhang }
40207e43b41SHong Zhang 
403a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
40407e43b41SHong Zhang {
40507e43b41SHong Zhang   __shared__ MatScalar shared[512];
40607e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
40707e43b41SHong Zhang   /* multiple threads per row. */
40807e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
40907e43b41SHong Zhang   if (row < nrows) {
41007e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
41107e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
41207e43b41SHong Zhang 
41307e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
41407e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
41507e43b41SHong Zhang     __syncthreads();
41607e43b41SHong Zhang     if (threadIdx.y < 1) {
41707e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
41807e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
41907e43b41SHong Zhang     }
42007e43b41SHong Zhang   }
4212d1451d4SHong Zhang }
4222d1451d4SHong Zhang 
4232d1451d4SHong Zhang PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
4242d1451d4SHong Zhang {
4252d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
4262d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
4272d1451d4SHong Zhang   PetscScalar       *y;
4282d1451d4SHong Zhang   const PetscScalar *x;
429a9dd396cSHong Zhang   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
4302d1451d4SHong Zhang   MatScalar         *aval;
4312d1451d4SHong Zhang   PetscInt          *acolidx;
4322d1451d4SHong Zhang   PetscInt          *sliidx;
43307e43b41SHong Zhang   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
43407e43b41SHong Zhang   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
4352d1451d4SHong Zhang 
4362d1451d4SHong Zhang   PetscFunctionBegin;
4374e58db63SHong Zhang   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
4382d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
4392d1451d4SHong Zhang   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
4402d1451d4SHong Zhang   aval    = cudastruct->val;
4412d1451d4SHong Zhang   acolidx = cudastruct->colidx;
4422d1451d4SHong Zhang   sliidx  = cudastruct->sliidx;
4432d1451d4SHong Zhang 
4442d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayRead(xx, &x));
4452d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayWrite(yy, &y));
4462d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeBegin());
44707e43b41SHong Zhang 
44807e43b41SHong Zhang   switch (cudastruct->kernelchoice) {
44907e43b41SHong Zhang   case 9:
4504e58db63SHong Zhang     nblocks = 1 + (nrows - 1) / sliceheight;
4514e58db63SHong Zhang     if (cudastruct->blocky == 2) {
4524e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4534e58db63SHong Zhang     } else if (cudastruct->blocky == 4) {
4544e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4554e58db63SHong Zhang     } else if (cudastruct->blocky == 8) {
4564e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4574e58db63SHong Zhang     } else if (cudastruct->blocky == 16) {
4584e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4594e58db63SHong Zhang     } else if (cudastruct->blocky == 32) {
4604e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
461*cca9ff8bSHong Zhang     } else {
462*cca9ff8bSHong Zhang       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
46307e43b41SHong Zhang     }
46407e43b41SHong Zhang     break;
46507e43b41SHong Zhang   case 7:
4664e58db63SHong Zhang     nblocks = 1 + (nrows - 1) / (2 * sliceheight);
4674e58db63SHong Zhang     if (cudastruct->blocky == 2) {
4684e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4694e58db63SHong Zhang     } else if (cudastruct->blocky == 4) {
4704e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4714e58db63SHong Zhang     } else if (cudastruct->blocky == 8) {
4724e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4734e58db63SHong Zhang     } else if (cudastruct->blocky == 16) {
4744e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4754e58db63SHong Zhang     } else if (cudastruct->blocky == 32) {
4764e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4774e58db63SHong Zhang     } else {
4784e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
4794e58db63SHong Zhang     }
48007e43b41SHong Zhang     break;
48107e43b41SHong Zhang   case 6:
48207e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
483a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
48407e43b41SHong Zhang     break;
48507e43b41SHong Zhang   case 5:
48607e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
487a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
48807e43b41SHong Zhang     break;
48907e43b41SHong Zhang   case 4:
49007e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
491a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
49207e43b41SHong Zhang     break;
49307e43b41SHong Zhang   case 3:
49407e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
495a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
49607e43b41SHong Zhang     break;
49707e43b41SHong Zhang   case 2: /* 16 slices per block if blocksize=512 */
49807e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 2);
499a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
50007e43b41SHong Zhang     break;
50107e43b41SHong Zhang   case 1: /* 32 slices per block if blocksize=512 */
50207e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / blocksize;
5034e58db63SHong Zhang     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
50407e43b41SHong Zhang     break;
50507e43b41SHong Zhang   case 0:
5064e58db63SHong Zhang     if (sliceheight * a->maxslicewidth > 20800) { /* important threshold */
5074e58db63SHong Zhang       nblocks = 1 + (nrows - 1) / sliceheight;
5084e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
5092d1451d4SHong Zhang     } else {
510*cca9ff8bSHong Zhang       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
511*cca9ff8bSHong Zhang       if (avgslicesize <= 96) {
512*cca9ff8bSHong Zhang         nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
513*cca9ff8bSHong Zhang         matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
514*cca9ff8bSHong Zhang       } else if (avgslicesize <= 432) {
515*cca9ff8bSHong Zhang         nblocks = 1 + (nrows - 1) / sliceheight;
516*cca9ff8bSHong Zhang         matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
517*cca9ff8bSHong Zhang       } else if (avgslicesize <= 2400) {
518*cca9ff8bSHong Zhang         nblocks = 1 + (nrows - 1) / sliceheight;
519*cca9ff8bSHong Zhang         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
520*cca9ff8bSHong Zhang       } else {
5214e58db63SHong Zhang         nblocks = 1 + (nrows - 1) / sliceheight;
5224e58db63SHong Zhang         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
5232d1451d4SHong Zhang       }
5242d1451d4SHong Zhang     }
525*cca9ff8bSHong Zhang     break;
526*cca9ff8bSHong Zhang   }
5272d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeEnd());
5282d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayRead(xx, &x));
5292d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
5302d1451d4SHong Zhang   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
5312d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
5322d1451d4SHong Zhang }
5332d1451d4SHong Zhang 
5342d1451d4SHong Zhang PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
5352d1451d4SHong Zhang {
5362d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
5372d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
5382d1451d4SHong Zhang   PetscScalar       *z;
5392d1451d4SHong Zhang   const PetscScalar *y, *x;
540a9dd396cSHong Zhang   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
5412d1451d4SHong Zhang   MatScalar         *aval    = cudastruct->val;
5422d1451d4SHong Zhang   PetscInt          *acolidx = cudastruct->colidx;
5432d1451d4SHong Zhang   PetscInt          *sliidx  = cudastruct->sliidx;
5442d1451d4SHong Zhang 
5452d1451d4SHong Zhang   PetscFunctionBegin;
5464e58db63SHong Zhang   PetscCheck(sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
5472d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
5482d1451d4SHong Zhang   if (a->nz) {
54907e43b41SHong Zhang     PetscInt nblocks, blocksize = 512;
55007e43b41SHong Zhang     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
5512d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(xx, &x));
5522d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(yy, &y));
5532d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayWrite(zz, &z));
5542d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeBegin());
55507e43b41SHong Zhang 
55607e43b41SHong Zhang     switch (cudastruct->kernelchoice) {
557*cca9ff8bSHong Zhang     case 9:
558*cca9ff8bSHong Zhang       nblocks = 1 + (nrows - 1) / sliceheight;
559*cca9ff8bSHong Zhang       if (cudastruct->blocky == 2) {
560*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
561*cca9ff8bSHong Zhang       } else if (cudastruct->blocky == 4) {
562*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
563*cca9ff8bSHong Zhang       } else if (cudastruct->blocky == 8) {
564*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
565*cca9ff8bSHong Zhang       } else if (cudastruct->blocky == 16) {
566*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
567*cca9ff8bSHong Zhang       } else if (cudastruct->blocky == 32) {
568*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
569*cca9ff8bSHong Zhang       } else {
570*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
571*cca9ff8bSHong Zhang       }
572*cca9ff8bSHong Zhang       break;
573*cca9ff8bSHong Zhang     case 7:
574*cca9ff8bSHong Zhang       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
575*cca9ff8bSHong Zhang       if (cudastruct->blocky == 2) {
576*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
577*cca9ff8bSHong Zhang       } else if (cudastruct->blocky == 4) {
578*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
579*cca9ff8bSHong Zhang       } else if (cudastruct->blocky == 8) {
580*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
581*cca9ff8bSHong Zhang       } else if (cudastruct->blocky == 16) {
582*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
583*cca9ff8bSHong Zhang       } else if (cudastruct->blocky == 32) {
584*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
585*cca9ff8bSHong Zhang       } else {
586*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
587*cca9ff8bSHong Zhang       }
588*cca9ff8bSHong Zhang       break;
58907e43b41SHong Zhang     case 6:
59007e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 32);
591a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
59207e43b41SHong Zhang       break;
59307e43b41SHong Zhang     case 5:
59407e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 16);
595a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
59607e43b41SHong Zhang       break;
59707e43b41SHong Zhang     case 4:
59807e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 8);
599a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
60007e43b41SHong Zhang       break;
60107e43b41SHong Zhang     case 3:
60207e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 4);
603a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
60407e43b41SHong Zhang       break;
60507e43b41SHong Zhang     case 2:
60607e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 2);
607a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
60807e43b41SHong Zhang       break;
60907e43b41SHong Zhang     case 1:
61007e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / blocksize;
6114e58db63SHong Zhang       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
61207e43b41SHong Zhang       break;
613*cca9ff8bSHong Zhang     case 0:
614*cca9ff8bSHong Zhang       if (sliceheight * a->maxslicewidth > 20800) {
615*cca9ff8bSHong Zhang         nblocks = 1 + (nrows - 1) / sliceheight;
616*cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
617*cca9ff8bSHong Zhang       } else {
618*cca9ff8bSHong Zhang         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
619*cca9ff8bSHong Zhang         if (avgslicesize <= 96) {
620*cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
621*cca9ff8bSHong Zhang           matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
622*cca9ff8bSHong Zhang         } else if (avgslicesize <= 432) {
623*cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
624*cca9ff8bSHong Zhang           matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
625*cca9ff8bSHong Zhang         } else if (avgslicesize <= 2400) {
626*cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
627*cca9ff8bSHong Zhang           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
628*cca9ff8bSHong Zhang         } else {
629*cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
630*cca9ff8bSHong Zhang           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
631*cca9ff8bSHong Zhang         }
632*cca9ff8bSHong Zhang       }
63307e43b41SHong Zhang       break;
6342d1451d4SHong Zhang     }
6352d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeEnd());
6362d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(xx, &x));
6372d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(yy, &y));
6382d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
6392d1451d4SHong Zhang     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
6402d1451d4SHong Zhang   } else {
6412d1451d4SHong Zhang     PetscCall(VecCopy(yy, zz));
6422d1451d4SHong Zhang   }
6432d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
6442d1451d4SHong Zhang }
6452d1451d4SHong Zhang 
6462d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
6472d1451d4SHong Zhang {
64807e43b41SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
6494e58db63SHong Zhang   PetscInt         kernel, blocky;
65007e43b41SHong Zhang   PetscBool        flg;
65107e43b41SHong Zhang 
6522d1451d4SHong Zhang   PetscFunctionBegin;
6532d1451d4SHong Zhang   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
65407e43b41SHong Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
65507e43b41SHong Zhang   if (flg) {
65607e43b41SHong Zhang     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
65707e43b41SHong Zhang     cudastruct->kernelchoice = kernel;
65807e43b41SHong Zhang   }
6594e58db63SHong Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
6604e58db63SHong Zhang   if (flg) {
6614e58db63SHong Zhang     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", kernel);
6624e58db63SHong Zhang     cudastruct->blocky = blocky;
6634e58db63SHong Zhang   }
6642d1451d4SHong Zhang   PetscOptionsHeadEnd();
6652d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
6662d1451d4SHong Zhang }
6672d1451d4SHong Zhang 
66807e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
66907e43b41SHong Zhang {
67007e43b41SHong Zhang   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
67107e43b41SHong Zhang 
67207e43b41SHong Zhang   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
67307e43b41SHong Zhang   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
67407e43b41SHong Zhang   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
67507e43b41SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
67607e43b41SHong Zhang }
67707e43b41SHong Zhang 
6782d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
6792d1451d4SHong Zhang {
6802d1451d4SHong Zhang   PetscFunctionBegin;
6812d1451d4SHong Zhang   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
68207e43b41SHong Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
6832d1451d4SHong Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
6842d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
6852d1451d4SHong Zhang   A->ops->mult    = MatMult_SeqSELLCUDA;
6862d1451d4SHong Zhang   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
6872d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
6882d1451d4SHong Zhang }
6892d1451d4SHong Zhang 
6902d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
6912d1451d4SHong Zhang {
6922d1451d4SHong Zhang   PetscFunctionBegin;
6932d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) {
6942d1451d4SHong Zhang     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
6952d1451d4SHong Zhang   }
6962d1451d4SHong Zhang   PetscCall(MatDestroy_SeqSELL(A));
6972d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
6982d1451d4SHong Zhang }
6992d1451d4SHong Zhang 
7002d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
7012d1451d4SHong Zhang static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
7022d1451d4SHong Zhang {
7032d1451d4SHong Zhang   PetscFunctionBegin;
7042d1451d4SHong Zhang   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
7052d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
7062d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
7072d1451d4SHong Zhang }
7082d1451d4SHong Zhang 
7092d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
7102d1451d4SHong Zhang {
7112d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct;
7122d1451d4SHong Zhang 
7132d1451d4SHong Zhang   PetscFunctionBegin;
7142d1451d4SHong Zhang   PetscCall(PetscFree(B->defaultvectype));
7152d1451d4SHong Zhang   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
7162d1451d4SHong Zhang 
7172d1451d4SHong Zhang   if (!B->spptr) {
7182d1451d4SHong Zhang     if (B->factortype == MAT_FACTOR_NONE) {
7192d1451d4SHong Zhang       PetscCall(PetscNew(&cudastruct));
7202d1451d4SHong Zhang       B->spptr = cudastruct;
7212d1451d4SHong Zhang     }
7222d1451d4SHong Zhang   }
7232d1451d4SHong Zhang 
7242d1451d4SHong Zhang   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
7252d1451d4SHong Zhang   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
7262d1451d4SHong Zhang   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
7272d1451d4SHong Zhang   B->ops->mult           = MatMult_SeqSELLCUDA;
7282d1451d4SHong Zhang   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
7292d1451d4SHong Zhang   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
7302d1451d4SHong Zhang 
73107e43b41SHong Zhang   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
73207e43b41SHong Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
73307e43b41SHong Zhang 
7342d1451d4SHong Zhang   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
7352d1451d4SHong Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
7362d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
7372d1451d4SHong Zhang }
7382d1451d4SHong Zhang 
7392d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
7402d1451d4SHong Zhang {
7412d1451d4SHong Zhang   PetscFunctionBegin;
7422d1451d4SHong Zhang   PetscCall(MatCreate_SeqSELL(B));
7432d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
7442d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
7452d1451d4SHong Zhang }
746