xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision 90d2215b4f5b865a0e7953289703ce9072a42621)
12d1451d4SHong Zhang #include <cuda_runtime.h>
22d1451d4SHong Zhang 
32d1451d4SHong Zhang #include <petscdevice_cuda.h>
42d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
52d1451d4SHong Zhang 
607e43b41SHong Zhang #define SLICE_HEIGHT 16
707e43b41SHong Zhang 
82d1451d4SHong Zhang typedef struct {
9*90d2215bSHong Zhang   PetscInt   maxallocmat;
10*90d2215bSHong Zhang   PetscInt   totalentries;
11*90d2215bSHong Zhang   PetscInt  *colidx; /* column index array, device pointer */
12*90d2215bSHong Zhang   MatScalar *val;    /* value array, device pointer */
13*90d2215bSHong Zhang   PetscInt   totalslices;
14*90d2215bSHong Zhang   PetscInt  *sliidx; /* slice index array, device pointer */
152d1451d4SHong Zhang   PetscInt   nonzerostate;
1607e43b41SHong Zhang   PetscInt   kernelchoice;
174e58db63SHong Zhang   PetscInt   blocky;
18*90d2215bSHong Zhang   PetscInt   chunksperblock;
19*90d2215bSHong Zhang   PetscInt   totalchunks;
20*90d2215bSHong Zhang   PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
212d1451d4SHong Zhang } Mat_SeqSELLCUDA;
222d1451d4SHong Zhang 
232d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
242d1451d4SHong Zhang {
252d1451d4SHong Zhang   PetscFunctionBegin;
262d1451d4SHong Zhang   if (*cudastruct) {
272d1451d4SHong Zhang     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
282d1451d4SHong Zhang     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
292d1451d4SHong Zhang     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
30*90d2215bSHong Zhang     if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
312d1451d4SHong Zhang     PetscCall(PetscFree(*cudastruct));
322d1451d4SHong Zhang   }
332d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
342d1451d4SHong Zhang }
352d1451d4SHong Zhang 
362d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
372d1451d4SHong Zhang {
382d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
392d1451d4SHong Zhang   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
402d1451d4SHong Zhang 
412d1451d4SHong Zhang   PetscFunctionBegin;
422d1451d4SHong Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
432d1451d4SHong Zhang     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
442d1451d4SHong Zhang     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
452d1451d4SHong Zhang       /* copy values only */
462d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
472d1451d4SHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
482d1451d4SHong Zhang     } else {
492d1451d4SHong Zhang       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
502d1451d4SHong Zhang       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
512d1451d4SHong Zhang       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
52*90d2215bSHong Zhang       if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
53*90d2215bSHong Zhang       cudastruct->maxallocmat  = a->maxallocmat;
54*90d2215bSHong Zhang       cudastruct->totalentries = a->sliidx[a->totalslices];
55*90d2215bSHong Zhang       cudastruct->totalslices  = a->totalslices;
56*90d2215bSHong Zhang       cudastruct->totalchunks  = a->totalchunks;
572d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
582d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
592d1451d4SHong Zhang       /* copy values, nz or maxallocmat? */
602d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
612d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
6207e43b41SHong Zhang 
6307e43b41SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
6407e43b41SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
65*90d2215bSHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->chunk_slice_map), a->totalchunks * sizeof(PetscInt)));
66*90d2215bSHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(PetscInt), cudaMemcpyHostToDevice));
67*90d2215bSHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
682d1451d4SHong Zhang     }
692d1451d4SHong Zhang     PetscCallCUDA(WaitForCUDA());
702d1451d4SHong Zhang     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
712d1451d4SHong Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
722d1451d4SHong Zhang   }
732d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
742d1451d4SHong Zhang }
752d1451d4SHong Zhang 
764e58db63SHong Zhang __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
772d1451d4SHong Zhang {
782d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
792d1451d4SHong Zhang   MatScalar sum;
802d1451d4SHong Zhang   /* one thread per row. */
812d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
822d1451d4SHong Zhang   if (row < nrows) {
834e58db63SHong Zhang     slice_id     = row / sliceheight;
844e58db63SHong Zhang     row_in_slice = row % sliceheight;
852d1451d4SHong Zhang     sum          = 0.0;
864e58db63SHong Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
872d1451d4SHong Zhang     y[row] = sum;
882d1451d4SHong Zhang   }
892d1451d4SHong Zhang }
902d1451d4SHong Zhang 
914e58db63SHong Zhang __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
922d1451d4SHong Zhang {
932d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
942d1451d4SHong Zhang   MatScalar sum;
952d1451d4SHong Zhang   /* one thread per row. */
962d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
972d1451d4SHong Zhang   if (row < nrows) {
984e58db63SHong Zhang     slice_id     = row / sliceheight;
994e58db63SHong Zhang     row_in_slice = row % sliceheight;
1002d1451d4SHong Zhang     sum          = 0.0;
1014e58db63SHong Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
1022d1451d4SHong Zhang     z[row] = y[row] + sum;
1032d1451d4SHong Zhang   }
1042d1451d4SHong Zhang }
10507e43b41SHong Zhang 
10607e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */
10707e43b41SHong Zhang template <int BLOCKY>
1084e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
10907e43b41SHong Zhang {
1104e58db63SHong Zhang   __shared__ MatScalar shared[32][BLOCKY];
1114e58db63SHong Zhang   PetscInt             i, row, slice_id = blockIdx.x;
1124e58db63SHong Zhang   int                  tid = threadIdx.x + threadIdx.y * 32;
11307e43b41SHong Zhang   /* transposed index */
11407e43b41SHong Zhang   int         tidx = tid % BLOCKY;
11507e43b41SHong Zhang   int         tidy = tid / BLOCKY;
11607e43b41SHong Zhang   PetscScalar t    = 0.0;
1174e58db63SHong Zhang 
1184e58db63SHong Zhang   row = slice_id * sliceheight + threadIdx.x % sliceheight;
11907e43b41SHong Zhang   if (row < nrows) {
1204e58db63SHong Zhang     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
1212d1451d4SHong Zhang   }
1224e58db63SHong Zhang #pragma unroll
1234e58db63SHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
12407e43b41SHong Zhang   /* transpose layout to reduce each row using warp shfl */
1251f0d1278SHong Zhang   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
12607e43b41SHong Zhang   __syncthreads();
1271f0d1278SHong Zhang   if (tidy < sliceheight) t = shared[tidy][tidx];
12807e43b41SHong Zhang #pragma unroll
12907e43b41SHong Zhang   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
1304e58db63SHong Zhang   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
13107e43b41SHong Zhang   __syncthreads();
1324e58db63SHong Zhang   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
1332d1451d4SHong Zhang }
1342d1451d4SHong Zhang 
135cca9ff8bSHong Zhang /* use 1 block per slice, suitable for large slice width */
136cca9ff8bSHong Zhang template <int BLOCKY>
137cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
138cca9ff8bSHong Zhang {
139cca9ff8bSHong Zhang   __shared__ MatScalar shared[32][BLOCKY];
140cca9ff8bSHong Zhang   PetscInt             i, row, slice_id = blockIdx.x;
141cca9ff8bSHong Zhang   int                  tid = threadIdx.x + threadIdx.y * 32;
142cca9ff8bSHong Zhang   /* transposed index */
143cca9ff8bSHong Zhang   int         tidx = tid % BLOCKY;
144cca9ff8bSHong Zhang   int         tidy = tid / BLOCKY;
145cca9ff8bSHong Zhang   PetscScalar t    = 0.0;
146cca9ff8bSHong Zhang 
147cca9ff8bSHong Zhang   row = slice_id * sliceheight + threadIdx.x % sliceheight;
148cca9ff8bSHong Zhang   if (row < nrows) {
149cca9ff8bSHong Zhang     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
150cca9ff8bSHong Zhang   }
151cca9ff8bSHong Zhang #pragma unroll
152cca9ff8bSHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
153cca9ff8bSHong Zhang   /* transpose layout to reduce each row using warp shfl */
1541f0d1278SHong Zhang   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
155cca9ff8bSHong Zhang   __syncthreads();
1561f0d1278SHong Zhang   if (tidy < sliceheight) t = shared[tidy][tidx];
157cca9ff8bSHong Zhang #pragma unroll
158cca9ff8bSHong Zhang   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
159cca9ff8bSHong Zhang   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
160cca9ff8bSHong Zhang   __syncthreads();
161cca9ff8bSHong Zhang   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
162cca9ff8bSHong Zhang }
163cca9ff8bSHong Zhang 
164*90d2215bSHong Zhang template <int BLOCKY>
165*90d2215bSHong Zhang __device__ __forceinline__ bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
166*90d2215bSHong Zhang {
167*90d2215bSHong Zhang   bool head = true;
168*90d2215bSHong Zhang #pragma unroll
169*90d2215bSHong Zhang   for (int i = 1; i < BLOCKY * 2; i <<= 1) {
170*90d2215bSHong Zhang     int halfwarpid                         = threadIdx.y * 2 + threadIdx.x / 16;
171*90d2215bSHong Zhang     shared[threadIdx.x + threadIdx.y * 32] = 0;
172*90d2215bSHong Zhang     if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
173*90d2215bSHong Zhang       shared[threadIdx.x + threadIdx.y * 32] = *val;
174*90d2215bSHong Zhang       if (i == 1) head = false;
175*90d2215bSHong Zhang     }
176*90d2215bSHong Zhang     __syncthreads();
177*90d2215bSHong Zhang     if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
178*90d2215bSHong Zhang     __syncthreads();
179*90d2215bSHong Zhang   }
180*90d2215bSHong Zhang   return head;
181*90d2215bSHong Zhang }
182*90d2215bSHong Zhang 
183*90d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */
184*90d2215bSHong Zhang template <int BLOCKY>
185*90d2215bSHong Zhang __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
186*90d2215bSHong Zhang {
187*90d2215bSHong Zhang   __shared__ MatScalar shared[BLOCKY * 32];
188*90d2215bSHong Zhang   PetscInt             gid, row, start_slice, cid;
189*90d2215bSHong Zhang   PetscScalar          t = 0.0;
190*90d2215bSHong Zhang   /* zero out y */
191*90d2215bSHong Zhang   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
192*90d2215bSHong Zhang     gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
193*90d2215bSHong Zhang     if (gid < nrows) y[gid] = 0.0;
194*90d2215bSHong Zhang   }
195*90d2215bSHong Zhang   for (int iter = 0; iter < chunksperblock; iter++) {
196*90d2215bSHong Zhang     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
197*90d2215bSHong Zhang     if (cid < totalchunks) {
198*90d2215bSHong Zhang       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
199*90d2215bSHong Zhang       gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
200*90d2215bSHong Zhang       if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
201*90d2215bSHong Zhang         __shared__ PetscInt flag[BLOCKY * 2];
202*90d2215bSHong Zhang         bool                write;
203*90d2215bSHong Zhang         PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
204*90d2215bSHong Zhang         /* find out the slice that this element belongs to */
205*90d2215bSHong Zhang         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
206*90d2215bSHong Zhang         if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
207*90d2215bSHong Zhang         row = slice_id * sliceheight + threadIdx.x % sliceheight;
208*90d2215bSHong Zhang         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
209*90d2215bSHong Zhang         __syncthreads();
210*90d2215bSHong Zhang         write = segment_scan<BLOCKY>(flag, shared, &t);
211*90d2215bSHong Zhang         if (row < nrows && gid < totalentries && write) atomicAdd(&y[row], t);
212*90d2215bSHong Zhang         t = 0.0;
213*90d2215bSHong Zhang       } else { /* this iteration covers only one slice */
214*90d2215bSHong Zhang         row = start_slice * sliceheight + threadIdx.x % sliceheight;
215*90d2215bSHong Zhang         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
216*90d2215bSHong Zhang         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
217*90d2215bSHong Zhang           int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
218*90d2215bSHong Zhang /* reduction and write to output vector */
219*90d2215bSHong Zhang #pragma unroll
220*90d2215bSHong Zhang           for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
221*90d2215bSHong Zhang           /* transpose layout to reduce each row using warp shfl */
222*90d2215bSHong Zhang           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
223*90d2215bSHong Zhang           __syncthreads();
224*90d2215bSHong Zhang           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
225*90d2215bSHong Zhang #pragma unroll
226*90d2215bSHong Zhang           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
227*90d2215bSHong Zhang           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
228*90d2215bSHong Zhang           __syncthreads();
229*90d2215bSHong Zhang           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
230*90d2215bSHong Zhang           t = 0.0;
231*90d2215bSHong Zhang         }
232*90d2215bSHong Zhang       }
233*90d2215bSHong Zhang     }
234*90d2215bSHong Zhang   }
235*90d2215bSHong Zhang }
236*90d2215bSHong Zhang 
237*90d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */
238*90d2215bSHong Zhang template <int BLOCKY>
239*90d2215bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
240*90d2215bSHong Zhang {
241*90d2215bSHong Zhang   __shared__ MatScalar shared[BLOCKY * 32];
242*90d2215bSHong Zhang   PetscInt             gid, row, start_slice, cid;
243*90d2215bSHong Zhang   PetscScalar          t = 0.0;
244*90d2215bSHong Zhang   /* copy y to z */
245*90d2215bSHong Zhang   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
246*90d2215bSHong Zhang     gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
247*90d2215bSHong Zhang     if (gid < nrows) z[gid] = y[gid];
248*90d2215bSHong Zhang   }
249*90d2215bSHong Zhang   for (int iter = 0; iter < chunksperblock; iter++) {
250*90d2215bSHong Zhang     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
251*90d2215bSHong Zhang     if (cid < totalchunks) {
252*90d2215bSHong Zhang       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
253*90d2215bSHong Zhang       gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
254*90d2215bSHong Zhang       if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
255*90d2215bSHong Zhang         __shared__ PetscInt flag[BLOCKY * 2];
256*90d2215bSHong Zhang         bool                write;
257*90d2215bSHong Zhang         PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
258*90d2215bSHong Zhang         /* find out the slice that this element belongs to */
259*90d2215bSHong Zhang         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
260*90d2215bSHong Zhang         if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
261*90d2215bSHong Zhang         row = slice_id * sliceheight + threadIdx.x % sliceheight;
262*90d2215bSHong Zhang         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
263*90d2215bSHong Zhang         __syncthreads();
264*90d2215bSHong Zhang         write = segment_scan<BLOCKY>(flag, shared, &t);
265*90d2215bSHong Zhang         if (row < nrows && gid < totalentries && write) atomicAdd(&z[row], t);
266*90d2215bSHong Zhang         t = 0.0;
267*90d2215bSHong Zhang       } else { /* this iteration covers only one slice */
268*90d2215bSHong Zhang         row = start_slice * sliceheight + threadIdx.x % sliceheight;
269*90d2215bSHong Zhang         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
270*90d2215bSHong Zhang         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
271*90d2215bSHong Zhang           int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
272*90d2215bSHong Zhang /* reduction and write to output vector */
273*90d2215bSHong Zhang #pragma unroll
274*90d2215bSHong Zhang           for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
275*90d2215bSHong Zhang           /* transpose layout to reduce each row using warp shfl */
276*90d2215bSHong Zhang           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
277*90d2215bSHong Zhang           __syncthreads();
278*90d2215bSHong Zhang           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
279*90d2215bSHong Zhang #pragma unroll
280*90d2215bSHong Zhang           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
281*90d2215bSHong Zhang           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
282*90d2215bSHong Zhang           __syncthreads();
283*90d2215bSHong Zhang           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
284*90d2215bSHong Zhang           t = 0.0;
285*90d2215bSHong Zhang         }
286*90d2215bSHong Zhang       }
287*90d2215bSHong Zhang     }
288*90d2215bSHong Zhang   }
289*90d2215bSHong Zhang }
290*90d2215bSHong Zhang 
29107e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */
2924e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
2932d1451d4SHong Zhang {
29407e43b41SHong Zhang   PetscInt i, row, slice_id;
29507e43b41SHong Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
2964e58db63SHong Zhang   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
29707e43b41SHong Zhang   double t = 0.0;
29807e43b41SHong Zhang   if (row < nrows) {
29907e43b41SHong Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
30007e43b41SHong Zhang   }
3014e58db63SHong Zhang #pragma unroll
3024e58db63SHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
3034e58db63SHong Zhang   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
30407e43b41SHong Zhang }
30507e43b41SHong Zhang 
306cca9ff8bSHong Zhang /* use 1 warp per slice, suitable for small slice width */
307cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
308cca9ff8bSHong Zhang {
309cca9ff8bSHong Zhang   PetscInt i, row, slice_id;
310cca9ff8bSHong Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
311cca9ff8bSHong Zhang   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
312cca9ff8bSHong Zhang   double t = 0.0;
313cca9ff8bSHong Zhang   if (row < nrows) {
314cca9ff8bSHong Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
315cca9ff8bSHong Zhang   }
316cca9ff8bSHong Zhang #pragma unroll
317cca9ff8bSHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
318cca9ff8bSHong Zhang   if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
319cca9ff8bSHong Zhang }
320cca9ff8bSHong Zhang 
321a9dd396cSHong Zhang /***********  Kernel 2-6  are tied to slice height 16. They are kept only for performance comparison  **********/
322a9dd396cSHong Zhang 
323a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
32407e43b41SHong Zhang {
32507e43b41SHong Zhang   __shared__ MatScalar shared[512];
3262d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
32707e43b41SHong Zhang   /* multiple threads per row. */
3282d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
3292d1451d4SHong Zhang   if (row < nrows) {
3302d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
3312d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
3322d1451d4SHong Zhang 
3332d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
3342d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
33507e43b41SHong Zhang     __syncthreads();
33607e43b41SHong Zhang     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
33707e43b41SHong Zhang     __syncthreads();
33807e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
3392d1451d4SHong Zhang     __syncthreads();
3402d1451d4SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
3412d1451d4SHong Zhang     __syncthreads();
3422d1451d4SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
3432d1451d4SHong Zhang     __syncthreads();
3442d1451d4SHong Zhang     if (threadIdx.y < 1) {
34507e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
3462d1451d4SHong Zhang       y[row] = shared[threadIdx.x];
3472d1451d4SHong Zhang     }
3482d1451d4SHong Zhang   }
3492d1451d4SHong Zhang }
3502d1451d4SHong Zhang 
351a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
3522d1451d4SHong Zhang {
35307e43b41SHong Zhang   __shared__ MatScalar shared[512];
3542d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
35507e43b41SHong Zhang   /* multiple threads per row. */
3562d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
3572d1451d4SHong Zhang   if (row < nrows) {
3582d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
3592d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
3602d1451d4SHong Zhang 
3612d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
3622d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
36307e43b41SHong Zhang     __syncthreads();
36407e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
3652d1451d4SHong Zhang     __syncthreads();
3662d1451d4SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
3672d1451d4SHong Zhang     __syncthreads();
3682d1451d4SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
3692d1451d4SHong Zhang     __syncthreads();
3702d1451d4SHong Zhang     if (threadIdx.y < 1) {
37107e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
37207e43b41SHong Zhang       y[row] = shared[threadIdx.x];
37307e43b41SHong Zhang     }
37407e43b41SHong Zhang   }
37507e43b41SHong Zhang }
37607e43b41SHong Zhang 
377a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
37807e43b41SHong Zhang {
37907e43b41SHong Zhang   __shared__ MatScalar shared[512];
38007e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
38107e43b41SHong Zhang   /* multiple threads per row. */
38207e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
38307e43b41SHong Zhang   if (row < nrows) {
38407e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
38507e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
38607e43b41SHong Zhang 
38707e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
38807e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
38907e43b41SHong Zhang     __syncthreads();
39007e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
39107e43b41SHong Zhang     __syncthreads();
39207e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
39307e43b41SHong Zhang     __syncthreads();
39407e43b41SHong Zhang     if (threadIdx.y < 1) {
39507e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
39607e43b41SHong Zhang       y[row] = shared[threadIdx.x];
39707e43b41SHong Zhang     }
39807e43b41SHong Zhang   }
39907e43b41SHong Zhang }
40007e43b41SHong Zhang 
401a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
40207e43b41SHong Zhang {
40307e43b41SHong Zhang   __shared__ MatScalar shared[512];
40407e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
40507e43b41SHong Zhang   /* multiple threads per row. */
40607e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
40707e43b41SHong Zhang   if (row < nrows) {
40807e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
40907e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
41007e43b41SHong Zhang 
41107e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
41207e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
41307e43b41SHong Zhang     __syncthreads();
41407e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
41507e43b41SHong Zhang     __syncthreads();
41607e43b41SHong Zhang     if (threadIdx.y < 1) {
41707e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
41807e43b41SHong Zhang       y[row] = shared[threadIdx.x];
41907e43b41SHong Zhang     }
42007e43b41SHong Zhang   }
42107e43b41SHong Zhang }
42207e43b41SHong Zhang 
423a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
42407e43b41SHong Zhang {
42507e43b41SHong Zhang   __shared__ MatScalar shared[512];
42607e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
42707e43b41SHong Zhang   /* multiple threads per row. */
42807e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
42907e43b41SHong Zhang   if (row < nrows) {
43007e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
43107e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
43207e43b41SHong Zhang 
43307e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
43407e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
43507e43b41SHong Zhang     __syncthreads();
43607e43b41SHong Zhang     if (threadIdx.y < 1) {
43707e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
43807e43b41SHong Zhang       y[row] = shared[threadIdx.x];
43907e43b41SHong Zhang     }
44007e43b41SHong Zhang   }
44107e43b41SHong Zhang }
44207e43b41SHong Zhang 
443a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
44407e43b41SHong Zhang {
44507e43b41SHong Zhang   __shared__ MatScalar shared[512];
44607e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
44707e43b41SHong Zhang   /* multiple threads per row. */
44807e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
44907e43b41SHong Zhang   if (row < nrows) {
45007e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
45107e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
45207e43b41SHong Zhang 
45307e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
45407e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
45507e43b41SHong Zhang     __syncthreads();
45607e43b41SHong Zhang     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
45707e43b41SHong Zhang     __syncthreads();
45807e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
45907e43b41SHong Zhang     __syncthreads();
46007e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
46107e43b41SHong Zhang     __syncthreads();
46207e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
46307e43b41SHong Zhang     __syncthreads();
46407e43b41SHong Zhang     if (threadIdx.y < 1) {
46507e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
4662d1451d4SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
4672d1451d4SHong Zhang     }
4682d1451d4SHong Zhang   }
4692d1451d4SHong Zhang }
47007e43b41SHong Zhang 
471a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
47207e43b41SHong Zhang {
47307e43b41SHong Zhang   __shared__ MatScalar shared[512];
47407e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
47507e43b41SHong Zhang   /* multiple threads per row. */
47607e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
47707e43b41SHong Zhang   if (row < nrows) {
47807e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
47907e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
48007e43b41SHong Zhang 
48107e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
48207e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
48307e43b41SHong Zhang     __syncthreads();
48407e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
48507e43b41SHong Zhang     __syncthreads();
48607e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
48707e43b41SHong Zhang     __syncthreads();
48807e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
48907e43b41SHong Zhang     __syncthreads();
49007e43b41SHong Zhang     if (threadIdx.y < 1) {
49107e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
49207e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
49307e43b41SHong Zhang     }
49407e43b41SHong Zhang   }
49507e43b41SHong Zhang }
49607e43b41SHong Zhang 
497a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
49807e43b41SHong Zhang {
49907e43b41SHong Zhang   __shared__ MatScalar shared[512];
50007e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
50107e43b41SHong Zhang   /* multiple threads per row. */
50207e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
50307e43b41SHong Zhang   if (row < nrows) {
50407e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
50507e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
50607e43b41SHong Zhang 
50707e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
50807e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
50907e43b41SHong Zhang     __syncthreads();
51007e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
51107e43b41SHong Zhang     __syncthreads();
51207e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
51307e43b41SHong Zhang     __syncthreads();
51407e43b41SHong Zhang     if (threadIdx.y < 1) {
51507e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
51607e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
51707e43b41SHong Zhang     }
51807e43b41SHong Zhang   }
51907e43b41SHong Zhang }
52007e43b41SHong Zhang 
521a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
52207e43b41SHong Zhang {
52307e43b41SHong Zhang   __shared__ MatScalar shared[512];
52407e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
52507e43b41SHong Zhang   /* multiple threads per row. */
52607e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
52707e43b41SHong Zhang   if (row < nrows) {
52807e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
52907e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
53007e43b41SHong Zhang 
53107e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
53207e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
53307e43b41SHong Zhang     __syncthreads();
53407e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
53507e43b41SHong Zhang     __syncthreads();
53607e43b41SHong Zhang     if (threadIdx.y < 1) {
53707e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
53807e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
53907e43b41SHong Zhang     }
54007e43b41SHong Zhang   }
54107e43b41SHong Zhang }
54207e43b41SHong Zhang 
543a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
54407e43b41SHong Zhang {
54507e43b41SHong Zhang   __shared__ MatScalar shared[512];
54607e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
54707e43b41SHong Zhang   /* multiple threads per row. */
54807e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
54907e43b41SHong Zhang   if (row < nrows) {
55007e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
55107e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
55207e43b41SHong Zhang 
55307e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
55407e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
55507e43b41SHong Zhang     __syncthreads();
55607e43b41SHong Zhang     if (threadIdx.y < 1) {
55707e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
55807e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
55907e43b41SHong Zhang     }
56007e43b41SHong Zhang   }
5612d1451d4SHong Zhang }
5622d1451d4SHong Zhang 
5632d1451d4SHong Zhang PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
5642d1451d4SHong Zhang {
5652d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
5662d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
5672d1451d4SHong Zhang   PetscScalar       *y;
5682d1451d4SHong Zhang   const PetscScalar *x;
569a9dd396cSHong Zhang   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
570*90d2215bSHong Zhang   PetscInt           chunksperblock, nchunks, *chunk_slice_map;
5712d1451d4SHong Zhang   MatScalar         *aval;
5722d1451d4SHong Zhang   PetscInt          *acolidx;
5732d1451d4SHong Zhang   PetscInt          *sliidx;
57407e43b41SHong Zhang   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
57507e43b41SHong Zhang   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
5762d1451d4SHong Zhang 
5772d1451d4SHong Zhang   PetscFunctionBegin;
5784e58db63SHong Zhang   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
579*90d2215bSHong Zhang   PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
5802d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
5812d1451d4SHong Zhang   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
5822d1451d4SHong Zhang   aval    = cudastruct->val;
5832d1451d4SHong Zhang   acolidx = cudastruct->colidx;
5842d1451d4SHong Zhang   sliidx  = cudastruct->sliidx;
5852d1451d4SHong Zhang 
5862d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayRead(xx, &x));
5872d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayWrite(yy, &y));
5882d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeBegin());
58907e43b41SHong Zhang 
59007e43b41SHong Zhang   switch (cudastruct->kernelchoice) {
59107e43b41SHong Zhang   case 9:
5924e58db63SHong Zhang     nblocks = 1 + (nrows - 1) / sliceheight;
5934e58db63SHong Zhang     if (cudastruct->blocky == 2) {
5944e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
5954e58db63SHong Zhang     } else if (cudastruct->blocky == 4) {
5964e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
5974e58db63SHong Zhang     } else if (cudastruct->blocky == 8) {
5984e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
5994e58db63SHong Zhang     } else if (cudastruct->blocky == 16) {
6004e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6014e58db63SHong Zhang     } else if (cudastruct->blocky == 32) {
6024e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
603cca9ff8bSHong Zhang     } else {
604cca9ff8bSHong Zhang       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
60507e43b41SHong Zhang     }
60607e43b41SHong Zhang     break;
60707e43b41SHong Zhang   case 7:
6084e58db63SHong Zhang     nblocks = 1 + (nrows - 1) / (2 * sliceheight);
6094e58db63SHong Zhang     if (cudastruct->blocky == 2) {
6104e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6114e58db63SHong Zhang     } else if (cudastruct->blocky == 4) {
6124e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6134e58db63SHong Zhang     } else if (cudastruct->blocky == 8) {
6144e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6154e58db63SHong Zhang     } else if (cudastruct->blocky == 16) {
6164e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6174e58db63SHong Zhang     } else if (cudastruct->blocky == 32) {
6184e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6194e58db63SHong Zhang     } else {
6204e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6214e58db63SHong Zhang     }
62207e43b41SHong Zhang     break;
62307e43b41SHong Zhang   case 6:
62407e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
625a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
62607e43b41SHong Zhang     break;
62707e43b41SHong Zhang   case 5:
62807e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
629a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
63007e43b41SHong Zhang     break;
63107e43b41SHong Zhang   case 4:
63207e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
633a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
63407e43b41SHong Zhang     break;
63507e43b41SHong Zhang   case 3:
63607e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
637a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
63807e43b41SHong Zhang     break;
63907e43b41SHong Zhang   case 2: /* 16 slices per block if blocksize=512 */
64007e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 2);
641a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
64207e43b41SHong Zhang     break;
64307e43b41SHong Zhang   case 1: /* 32 slices per block if blocksize=512 */
64407e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / blocksize;
6454e58db63SHong Zhang     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
64607e43b41SHong Zhang     break;
64707e43b41SHong Zhang   case 0:
648*90d2215bSHong Zhang     maxoveravg = a->maxslicewidth / a->avgslicewidth;
649*90d2215bSHong Zhang     if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
650*90d2215bSHong Zhang       /* each block handles approximately one slice */
651*90d2215bSHong Zhang       PetscInt blocky = a->chunksize / 32;
652*90d2215bSHong Zhang       nchunks         = cudastruct->totalchunks;
653*90d2215bSHong Zhang       chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
654*90d2215bSHong Zhang       nblocks         = 1 + (nchunks - 1) / chunksperblock;
655*90d2215bSHong Zhang       chunk_slice_map = cudastruct->chunk_slice_map;
656*90d2215bSHong Zhang       if (blocky == 2) {
657*90d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
658*90d2215bSHong Zhang       } else if (blocky == 4) {
659*90d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
660*90d2215bSHong Zhang       } else if (blocky == 8) {
661*90d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
662*90d2215bSHong Zhang       } else if (blocky == 16) {
663*90d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
664*90d2215bSHong Zhang       } else if (blocky == 32) {
665*90d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
666*90d2215bSHong Zhang       } else {
667*90d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
668*90d2215bSHong Zhang       }
6692d1451d4SHong Zhang     } else {
670cca9ff8bSHong Zhang       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
671*90d2215bSHong Zhang       if (avgslicesize <= 432) {
672*90d2215bSHong Zhang         if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
673cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
674cca9ff8bSHong Zhang           matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
675*90d2215bSHong Zhang         } else {
676cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
677cca9ff8bSHong Zhang           matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
678*90d2215bSHong Zhang         }
679cca9ff8bSHong Zhang       } else if (avgslicesize <= 2400) {
680cca9ff8bSHong Zhang         nblocks = 1 + (nrows - 1) / sliceheight;
681cca9ff8bSHong Zhang         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
682cca9ff8bSHong Zhang       } else {
6834e58db63SHong Zhang         nblocks = 1 + (nrows - 1) / sliceheight;
6844e58db63SHong Zhang         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6852d1451d4SHong Zhang       }
6862d1451d4SHong Zhang     }
687cca9ff8bSHong Zhang     break;
688cca9ff8bSHong Zhang   }
6892d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeEnd());
6902d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayRead(xx, &x));
6912d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
6922d1451d4SHong Zhang   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
6932d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
6942d1451d4SHong Zhang }
6952d1451d4SHong Zhang 
6962d1451d4SHong Zhang PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
6972d1451d4SHong Zhang {
6982d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
6992d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
7002d1451d4SHong Zhang   PetscScalar       *z;
7012d1451d4SHong Zhang   const PetscScalar *y, *x;
702a9dd396cSHong Zhang   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
703*90d2215bSHong Zhang   PetscInt           chunksperblock, nchunks, *chunk_slice_map;
7042d1451d4SHong Zhang   MatScalar         *aval    = cudastruct->val;
7052d1451d4SHong Zhang   PetscInt          *acolidx = cudastruct->colidx;
7062d1451d4SHong Zhang   PetscInt          *sliidx  = cudastruct->sliidx;
707*90d2215bSHong Zhang   PetscReal          maxoveravg;
7082d1451d4SHong Zhang 
7092d1451d4SHong Zhang   PetscFunctionBegin;
710*90d2215bSHong Zhang   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
711*90d2215bSHong Zhang   PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
7122d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
7132d1451d4SHong Zhang   if (a->nz) {
714*90d2215bSHong Zhang     PetscInt blocky = cudastruct->blocky, nblocks, blocksize = 512;
71507e43b41SHong Zhang     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
7162d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(xx, &x));
7172d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(yy, &y));
7182d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayWrite(zz, &z));
7192d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeBegin());
72007e43b41SHong Zhang 
72107e43b41SHong Zhang     switch (cudastruct->kernelchoice) {
722cca9ff8bSHong Zhang     case 9:
723cca9ff8bSHong Zhang       nblocks = 1 + (nrows - 1) / sliceheight;
724*90d2215bSHong Zhang       if (blocky == 2) {
725cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
726*90d2215bSHong Zhang       } else if (blocky == 4) {
727cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
728*90d2215bSHong Zhang       } else if (blocky == 8) {
729cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
730*90d2215bSHong Zhang       } else if (blocky == 16) {
731cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
732*90d2215bSHong Zhang       } else if (blocky == 32) {
733cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
734cca9ff8bSHong Zhang       } else {
735cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
736cca9ff8bSHong Zhang       }
737cca9ff8bSHong Zhang       break;
738*90d2215bSHong Zhang     case 8:
739*90d2215bSHong Zhang       /* each block handles approximately one slice */
740*90d2215bSHong Zhang       nchunks         = cudastruct->totalchunks;
741*90d2215bSHong Zhang       blocky          = a->chunksize / 32;
742*90d2215bSHong Zhang       chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
743*90d2215bSHong Zhang       nblocks         = 1 + (nchunks - 1) / chunksperblock;
744*90d2215bSHong Zhang       chunk_slice_map = cudastruct->chunk_slice_map;
745*90d2215bSHong Zhang       if (blocky == 2) {
746*90d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
747*90d2215bSHong Zhang       } else if (blocky == 4) {
748*90d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
749*90d2215bSHong Zhang       } else if (blocky == 8) {
750*90d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
751*90d2215bSHong Zhang       } else if (blocky == 16) {
752*90d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
753*90d2215bSHong Zhang       } else if (blocky == 32) {
754*90d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
755*90d2215bSHong Zhang       } else {
756*90d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
757*90d2215bSHong Zhang       }
758*90d2215bSHong Zhang       break;
759cca9ff8bSHong Zhang     case 7:
760cca9ff8bSHong Zhang       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
761*90d2215bSHong Zhang       if (blocky == 2) {
762cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
763*90d2215bSHong Zhang       } else if (blocky == 4) {
764cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
765*90d2215bSHong Zhang       } else if (blocky == 8) {
766cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
767*90d2215bSHong Zhang       } else if (blocky == 16) {
768cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
769*90d2215bSHong Zhang       } else if (blocky == 32) {
770cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
771cca9ff8bSHong Zhang       } else {
772cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
773cca9ff8bSHong Zhang       }
774cca9ff8bSHong Zhang       break;
77507e43b41SHong Zhang     case 6:
77607e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 32);
777a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
77807e43b41SHong Zhang       break;
77907e43b41SHong Zhang     case 5:
78007e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 16);
781a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
78207e43b41SHong Zhang       break;
78307e43b41SHong Zhang     case 4:
78407e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 8);
785a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
78607e43b41SHong Zhang       break;
78707e43b41SHong Zhang     case 3:
78807e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 4);
789a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
79007e43b41SHong Zhang       break;
79107e43b41SHong Zhang     case 2:
79207e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 2);
793a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
79407e43b41SHong Zhang       break;
79507e43b41SHong Zhang     case 1:
79607e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / blocksize;
7974e58db63SHong Zhang       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
79807e43b41SHong Zhang       break;
799cca9ff8bSHong Zhang     case 0:
800*90d2215bSHong Zhang       maxoveravg = a->maxslicewidth / a->avgslicewidth;
801*90d2215bSHong Zhang       if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
802*90d2215bSHong Zhang         /* each block handles approximately one slice */
803*90d2215bSHong Zhang         nchunks         = cudastruct->totalchunks;
804*90d2215bSHong Zhang         blocky          = a->chunksize / 32;
805*90d2215bSHong Zhang         chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
806*90d2215bSHong Zhang         nblocks         = 1 + (nchunks - 1) / chunksperblock;
807*90d2215bSHong Zhang         chunk_slice_map = cudastruct->chunk_slice_map;
808*90d2215bSHong Zhang         if (blocky == 2) {
809*90d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
810*90d2215bSHong Zhang         } else if (blocky == 4) {
811*90d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
812*90d2215bSHong Zhang         } else if (blocky == 8) {
813*90d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
814*90d2215bSHong Zhang         } else if (blocky == 16) {
815*90d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
816*90d2215bSHong Zhang         } else if (blocky == 32) {
817*90d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
818*90d2215bSHong Zhang         } else {
819*90d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
820*90d2215bSHong Zhang         }
821cca9ff8bSHong Zhang       } else {
822cca9ff8bSHong Zhang         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
823*90d2215bSHong Zhang         if (avgslicesize <= 432) {
824*90d2215bSHong Zhang           if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
825cca9ff8bSHong Zhang             nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
826cca9ff8bSHong Zhang             matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
827*90d2215bSHong Zhang           } else {
828cca9ff8bSHong Zhang             nblocks = 1 + (nrows - 1) / sliceheight;
829cca9ff8bSHong Zhang             matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
830*90d2215bSHong Zhang           }
831cca9ff8bSHong Zhang         } else if (avgslicesize <= 2400) {
832cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
833cca9ff8bSHong Zhang           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
834cca9ff8bSHong Zhang         } else {
835cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
836cca9ff8bSHong Zhang           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
837cca9ff8bSHong Zhang         }
838cca9ff8bSHong Zhang       }
83907e43b41SHong Zhang       break;
8402d1451d4SHong Zhang     }
8412d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeEnd());
8422d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(xx, &x));
8432d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(yy, &y));
8442d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
8452d1451d4SHong Zhang     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
8462d1451d4SHong Zhang   } else {
8472d1451d4SHong Zhang     PetscCall(VecCopy(yy, zz));
8482d1451d4SHong Zhang   }
8492d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
8502d1451d4SHong Zhang }
8512d1451d4SHong Zhang 
8522d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
8532d1451d4SHong Zhang {
85407e43b41SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
8554e58db63SHong Zhang   PetscInt         kernel, blocky;
85607e43b41SHong Zhang   PetscBool        flg;
85707e43b41SHong Zhang 
8582d1451d4SHong Zhang   PetscFunctionBegin;
8592d1451d4SHong Zhang   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
860*90d2215bSHong Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
861*90d2215bSHong Zhang   if (flg) {
862*90d2215bSHong Zhang     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
863*90d2215bSHong Zhang     cudastruct->blocky = blocky;
864*90d2215bSHong Zhang   }
86507e43b41SHong Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
86607e43b41SHong Zhang   if (flg) {
86707e43b41SHong Zhang     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
86807e43b41SHong Zhang     cudastruct->kernelchoice = kernel;
869*90d2215bSHong Zhang     if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
8704e58db63SHong Zhang   }
8712d1451d4SHong Zhang   PetscOptionsHeadEnd();
8722d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
8732d1451d4SHong Zhang }
8742d1451d4SHong Zhang 
87507e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
87607e43b41SHong Zhang {
87707e43b41SHong Zhang   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
87807e43b41SHong Zhang 
879*90d2215bSHong Zhang   PetscFunctionBegin;
88007e43b41SHong Zhang   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
88107e43b41SHong Zhang   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
88207e43b41SHong Zhang   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
88307e43b41SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
88407e43b41SHong Zhang }
88507e43b41SHong Zhang 
8862d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
8872d1451d4SHong Zhang {
8882d1451d4SHong Zhang   PetscFunctionBegin;
8892d1451d4SHong Zhang   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
89007e43b41SHong Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
8912d1451d4SHong Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
8922d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
8932d1451d4SHong Zhang   A->ops->mult    = MatMult_SeqSELLCUDA;
8942d1451d4SHong Zhang   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
8952d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
8962d1451d4SHong Zhang }
8972d1451d4SHong Zhang 
8982d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
8992d1451d4SHong Zhang {
9002d1451d4SHong Zhang   PetscFunctionBegin;
9012d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) {
9022d1451d4SHong Zhang     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
9032d1451d4SHong Zhang   }
9042d1451d4SHong Zhang   PetscCall(MatDestroy_SeqSELL(A));
9052d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9062d1451d4SHong Zhang }
9072d1451d4SHong Zhang 
9082d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
9092d1451d4SHong Zhang static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
9102d1451d4SHong Zhang {
9112d1451d4SHong Zhang   PetscFunctionBegin;
9122d1451d4SHong Zhang   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
9132d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
9142d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9152d1451d4SHong Zhang }
9162d1451d4SHong Zhang 
9172d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
9182d1451d4SHong Zhang {
9192d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct;
9202d1451d4SHong Zhang 
9212d1451d4SHong Zhang   PetscFunctionBegin;
9222d1451d4SHong Zhang   PetscCall(PetscFree(B->defaultvectype));
9232d1451d4SHong Zhang   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
9242d1451d4SHong Zhang 
9252d1451d4SHong Zhang   if (!B->spptr) {
9262d1451d4SHong Zhang     if (B->factortype == MAT_FACTOR_NONE) {
9272d1451d4SHong Zhang       PetscCall(PetscNew(&cudastruct));
9282d1451d4SHong Zhang       B->spptr = cudastruct;
9292d1451d4SHong Zhang     }
9302d1451d4SHong Zhang   }
9312d1451d4SHong Zhang 
9322d1451d4SHong Zhang   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
9332d1451d4SHong Zhang   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
9342d1451d4SHong Zhang   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
9352d1451d4SHong Zhang   B->ops->mult           = MatMult_SeqSELLCUDA;
9362d1451d4SHong Zhang   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
9372d1451d4SHong Zhang   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
9382d1451d4SHong Zhang 
93907e43b41SHong Zhang   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
94007e43b41SHong Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
94107e43b41SHong Zhang 
9422d1451d4SHong Zhang   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
9432d1451d4SHong Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
9442d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9452d1451d4SHong Zhang }
9462d1451d4SHong Zhang 
9472d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
9482d1451d4SHong Zhang {
9492d1451d4SHong Zhang   PetscFunctionBegin;
9502d1451d4SHong Zhang   PetscCall(MatCreate_SeqSELL(B));
9512d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
9522d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9532d1451d4SHong Zhang }
954