xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision f4f49eeac7efa77fffa46b7ff95a3ed169f659ed)
12d1451d4SHong Zhang #include <cuda_runtime.h>
22d1451d4SHong Zhang 
32d1451d4SHong Zhang #include <petscdevice_cuda.h>
46eb97cccSStefano Zampini #include <petsc/private/cupmatomics.hpp>
52d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
62d1451d4SHong Zhang 
707e43b41SHong Zhang #define SLICE_HEIGHT 16
807e43b41SHong Zhang 
92d1451d4SHong Zhang typedef struct {
1090d2215bSHong Zhang   PetscInt   maxallocmat;
1190d2215bSHong Zhang   PetscInt   totalentries;
1290d2215bSHong Zhang   PetscInt  *colidx; /* column index array, device pointer */
1390d2215bSHong Zhang   MatScalar *val;    /* value array, device pointer */
1490d2215bSHong Zhang   PetscInt   totalslices;
1590d2215bSHong Zhang   PetscInt  *sliidx; /* slice index array, device pointer */
162d1451d4SHong Zhang   PetscInt   nonzerostate;
1707e43b41SHong Zhang   PetscInt   kernelchoice;
184e58db63SHong Zhang   PetscInt   blocky;
1990d2215bSHong Zhang   PetscInt   chunksperblock;
2090d2215bSHong Zhang   PetscInt   totalchunks;
2190d2215bSHong Zhang   PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
222d1451d4SHong Zhang } Mat_SeqSELLCUDA;
232d1451d4SHong Zhang 
242d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
252d1451d4SHong Zhang {
262d1451d4SHong Zhang   PetscFunctionBegin;
272d1451d4SHong Zhang   if (*cudastruct) {
282d1451d4SHong Zhang     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
292d1451d4SHong Zhang     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
302d1451d4SHong Zhang     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
3190d2215bSHong Zhang     if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
322d1451d4SHong Zhang     PetscCall(PetscFree(*cudastruct));
332d1451d4SHong Zhang   }
342d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
352d1451d4SHong Zhang }
362d1451d4SHong Zhang 
372d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
382d1451d4SHong Zhang {
392d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
402d1451d4SHong Zhang   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
412d1451d4SHong Zhang 
422d1451d4SHong Zhang   PetscFunctionBegin;
432d1451d4SHong Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
442d1451d4SHong Zhang     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
452d1451d4SHong Zhang     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
462d1451d4SHong Zhang       /* copy values only */
472d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
482d1451d4SHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
492d1451d4SHong Zhang     } else {
502d1451d4SHong Zhang       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
512d1451d4SHong Zhang       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
522d1451d4SHong Zhang       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
5390d2215bSHong Zhang       if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
5490d2215bSHong Zhang       cudastruct->maxallocmat  = a->maxallocmat;
5590d2215bSHong Zhang       cudastruct->totalentries = a->sliidx[a->totalslices];
5690d2215bSHong Zhang       cudastruct->totalslices  = a->totalslices;
5790d2215bSHong Zhang       cudastruct->totalchunks  = a->totalchunks;
58*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
59*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
602d1451d4SHong Zhang       /* copy values, nz or maxallocmat? */
61*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
62*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));
6307e43b41SHong Zhang 
64*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
65*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
66*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
67*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
6890d2215bSHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
692d1451d4SHong Zhang     }
702d1451d4SHong Zhang     PetscCallCUDA(WaitForCUDA());
712d1451d4SHong Zhang     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
722d1451d4SHong Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
732d1451d4SHong Zhang   }
742d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
752d1451d4SHong Zhang }
762d1451d4SHong Zhang 
7766976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
782d1451d4SHong Zhang {
792d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
802d1451d4SHong Zhang   MatScalar sum;
812d1451d4SHong Zhang   /* one thread per row. */
822d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
832d1451d4SHong Zhang   if (row < nrows) {
844e58db63SHong Zhang     slice_id     = row / sliceheight;
854e58db63SHong Zhang     row_in_slice = row % sliceheight;
862d1451d4SHong Zhang     sum          = 0.0;
874e58db63SHong Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
882d1451d4SHong Zhang     y[row] = sum;
892d1451d4SHong Zhang   }
902d1451d4SHong Zhang }
912d1451d4SHong Zhang 
9266976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
932d1451d4SHong Zhang {
942d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
952d1451d4SHong Zhang   MatScalar sum;
962d1451d4SHong Zhang   /* one thread per row. */
972d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
982d1451d4SHong Zhang   if (row < nrows) {
994e58db63SHong Zhang     slice_id     = row / sliceheight;
1004e58db63SHong Zhang     row_in_slice = row % sliceheight;
1012d1451d4SHong Zhang     sum          = 0.0;
1024e58db63SHong Zhang     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
1032d1451d4SHong Zhang     z[row] = y[row] + sum;
1042d1451d4SHong Zhang   }
1052d1451d4SHong Zhang }
10607e43b41SHong Zhang 
1078711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
10807e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */
10907e43b41SHong Zhang template <int BLOCKY>
1104e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
11107e43b41SHong Zhang {
1124e58db63SHong Zhang   __shared__ MatScalar shared[32][BLOCKY];
1134e58db63SHong Zhang   PetscInt             i, row, slice_id = blockIdx.x;
1144e58db63SHong Zhang   int                  tid = threadIdx.x + threadIdx.y * 32;
11507e43b41SHong Zhang   /* transposed index */
11607e43b41SHong Zhang   int         tidx = tid % BLOCKY;
11707e43b41SHong Zhang   int         tidy = tid / BLOCKY;
11807e43b41SHong Zhang   PetscScalar t    = 0.0;
1194e58db63SHong Zhang 
1204e58db63SHong Zhang   row = slice_id * sliceheight + threadIdx.x % sliceheight;
12107e43b41SHong Zhang   if (row < nrows) {
1224e58db63SHong Zhang     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
1232d1451d4SHong Zhang   }
1244e58db63SHong Zhang   #pragma unroll
1254e58db63SHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
12607e43b41SHong Zhang   /* transpose layout to reduce each row using warp shfl */
1271f0d1278SHong Zhang   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
12807e43b41SHong Zhang   __syncthreads();
1291f0d1278SHong Zhang   if (tidy < sliceheight) t = shared[tidy][tidx];
13007e43b41SHong Zhang   #pragma unroll
13107e43b41SHong Zhang   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
1324e58db63SHong Zhang   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
13307e43b41SHong Zhang   __syncthreads();
1344e58db63SHong Zhang   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
1352d1451d4SHong Zhang }
1362d1451d4SHong Zhang 
137cca9ff8bSHong Zhang /* use 1 block per slice, suitable for large slice width */
138cca9ff8bSHong Zhang template <int BLOCKY>
139cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
140cca9ff8bSHong Zhang {
141cca9ff8bSHong Zhang   __shared__ MatScalar shared[32][BLOCKY];
142cca9ff8bSHong Zhang   PetscInt             i, row, slice_id = blockIdx.x;
143cca9ff8bSHong Zhang   int                  tid = threadIdx.x + threadIdx.y * 32;
144cca9ff8bSHong Zhang   /* transposed index */
145cca9ff8bSHong Zhang   int         tidx = tid % BLOCKY;
146cca9ff8bSHong Zhang   int         tidy = tid / BLOCKY;
147cca9ff8bSHong Zhang   PetscScalar t    = 0.0;
148cca9ff8bSHong Zhang 
149cca9ff8bSHong Zhang   row = slice_id * sliceheight + threadIdx.x % sliceheight;
150cca9ff8bSHong Zhang   if (row < nrows) {
151cca9ff8bSHong Zhang     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
152cca9ff8bSHong Zhang   }
153cca9ff8bSHong Zhang   #pragma unroll
154cca9ff8bSHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
155cca9ff8bSHong Zhang   /* transpose layout to reduce each row using warp shfl */
1561f0d1278SHong Zhang   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
157cca9ff8bSHong Zhang   __syncthreads();
1581f0d1278SHong Zhang   if (tidy < sliceheight) t = shared[tidy][tidx];
159cca9ff8bSHong Zhang   #pragma unroll
160cca9ff8bSHong Zhang   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
161cca9ff8bSHong Zhang   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
162cca9ff8bSHong Zhang   __syncthreads();
163cca9ff8bSHong Zhang   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
164cca9ff8bSHong Zhang }
165cca9ff8bSHong Zhang 
16690d2215bSHong Zhang template <int BLOCKY>
16766976f2fSJacob Faibussowitsch __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
16890d2215bSHong Zhang {
16990d2215bSHong Zhang   bool head = true;
17090d2215bSHong Zhang   #pragma unroll
17190d2215bSHong Zhang   for (int i = 1; i < BLOCKY * 2; i <<= 1) {
17290d2215bSHong Zhang     int halfwarpid                         = threadIdx.y * 2 + threadIdx.x / 16;
17390d2215bSHong Zhang     shared[threadIdx.x + threadIdx.y * 32] = 0;
17490d2215bSHong Zhang     if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
17590d2215bSHong Zhang       shared[threadIdx.x + threadIdx.y * 32] = *val;
17690d2215bSHong Zhang       if (i == 1) head = false;
17790d2215bSHong Zhang     }
17890d2215bSHong Zhang     __syncthreads();
17990d2215bSHong Zhang     if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
18090d2215bSHong Zhang     __syncthreads();
18190d2215bSHong Zhang   }
18290d2215bSHong Zhang   return head;
18390d2215bSHong Zhang }
18490d2215bSHong Zhang 
18590d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */
18690d2215bSHong Zhang template <int BLOCKY>
18790d2215bSHong Zhang __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
18890d2215bSHong Zhang {
18990d2215bSHong Zhang   __shared__ MatScalar shared[BLOCKY * 32];
19090d2215bSHong Zhang   PetscInt             gid, row, start_slice, cid;
19190d2215bSHong Zhang   PetscScalar          t = 0.0;
1926eb97cccSStefano Zampini   AtomicAdd<MatScalar> atomAdd;
19390d2215bSHong Zhang   /* zero out y */
19490d2215bSHong Zhang   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
19590d2215bSHong Zhang     gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
19690d2215bSHong Zhang     if (gid < nrows) y[gid] = 0.0;
19790d2215bSHong Zhang   }
19890d2215bSHong Zhang   for (int iter = 0; iter < chunksperblock; iter++) {
19990d2215bSHong Zhang     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
20090d2215bSHong Zhang     if (cid < totalchunks) {
20190d2215bSHong Zhang       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
20290d2215bSHong Zhang       gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
20390d2215bSHong Zhang       if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
20490d2215bSHong Zhang         __shared__ PetscInt flag[BLOCKY * 2];
20590d2215bSHong Zhang         bool                write;
20690d2215bSHong Zhang         PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
20790d2215bSHong Zhang         /* find out the slice that this element belongs to */
20890d2215bSHong Zhang         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
20990d2215bSHong Zhang         if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
21090d2215bSHong Zhang         row = slice_id * sliceheight + threadIdx.x % sliceheight;
21190d2215bSHong Zhang         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
21290d2215bSHong Zhang         __syncthreads();
21390d2215bSHong Zhang         write = segment_scan<BLOCKY>(flag, shared, &t);
2146eb97cccSStefano Zampini         if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
21590d2215bSHong Zhang         t = 0.0;
21690d2215bSHong Zhang       } else { /* this iteration covers only one slice */
21790d2215bSHong Zhang         row = start_slice * sliceheight + threadIdx.x % sliceheight;
21890d2215bSHong Zhang         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
21990d2215bSHong Zhang         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
22090d2215bSHong Zhang           int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
22190d2215bSHong Zhang   /* reduction and write to output vector */
22290d2215bSHong Zhang   #pragma unroll
22390d2215bSHong Zhang           for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
22490d2215bSHong Zhang           /* transpose layout to reduce each row using warp shfl */
22590d2215bSHong Zhang           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
22690d2215bSHong Zhang           __syncthreads();
22790d2215bSHong Zhang           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
22890d2215bSHong Zhang   #pragma unroll
22990d2215bSHong Zhang           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
23090d2215bSHong Zhang           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
23190d2215bSHong Zhang           __syncthreads();
2326eb97cccSStefano Zampini           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
23390d2215bSHong Zhang           t = 0.0;
23490d2215bSHong Zhang         }
23590d2215bSHong Zhang       }
23690d2215bSHong Zhang     }
23790d2215bSHong Zhang   }
23890d2215bSHong Zhang }
23990d2215bSHong Zhang 
24090d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */
24190d2215bSHong Zhang template <int BLOCKY>
24290d2215bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
24390d2215bSHong Zhang {
24490d2215bSHong Zhang   __shared__ MatScalar shared[BLOCKY * 32];
24590d2215bSHong Zhang   PetscInt             gid, row, start_slice, cid;
24690d2215bSHong Zhang   PetscScalar          t = 0.0;
2476eb97cccSStefano Zampini   AtomicAdd<MatScalar> atomAdd;
24890d2215bSHong Zhang   /* copy y to z */
24990d2215bSHong Zhang   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
25090d2215bSHong Zhang     gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
25190d2215bSHong Zhang     if (gid < nrows) z[gid] = y[gid];
25290d2215bSHong Zhang   }
25390d2215bSHong Zhang   for (int iter = 0; iter < chunksperblock; iter++) {
25490d2215bSHong Zhang     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
25590d2215bSHong Zhang     if (cid < totalchunks) {
25690d2215bSHong Zhang       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
25790d2215bSHong Zhang       gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
25890d2215bSHong Zhang       if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
25990d2215bSHong Zhang         __shared__ PetscInt flag[BLOCKY * 2];
26090d2215bSHong Zhang         bool                write;
26190d2215bSHong Zhang         PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
26290d2215bSHong Zhang         /* find out the slice that this element belongs to */
26390d2215bSHong Zhang         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
26490d2215bSHong Zhang         if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
26590d2215bSHong Zhang         row = slice_id * sliceheight + threadIdx.x % sliceheight;
26690d2215bSHong Zhang         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
26790d2215bSHong Zhang         __syncthreads();
26890d2215bSHong Zhang         write = segment_scan<BLOCKY>(flag, shared, &t);
2696eb97cccSStefano Zampini         if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
27090d2215bSHong Zhang         t = 0.0;
27190d2215bSHong Zhang       } else { /* this iteration covers only one slice */
27290d2215bSHong Zhang         row = start_slice * sliceheight + threadIdx.x % sliceheight;
27390d2215bSHong Zhang         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
27490d2215bSHong Zhang         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
27590d2215bSHong Zhang           int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
27690d2215bSHong Zhang   /* reduction and write to output vector */
27790d2215bSHong Zhang   #pragma unroll
27890d2215bSHong Zhang           for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
27990d2215bSHong Zhang           /* transpose layout to reduce each row using warp shfl */
28090d2215bSHong Zhang           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
28190d2215bSHong Zhang           __syncthreads();
28290d2215bSHong Zhang           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
28390d2215bSHong Zhang   #pragma unroll
28490d2215bSHong Zhang           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
28590d2215bSHong Zhang           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
28690d2215bSHong Zhang           __syncthreads();
2876eb97cccSStefano Zampini           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
28890d2215bSHong Zhang           t = 0.0;
28990d2215bSHong Zhang         }
29090d2215bSHong Zhang       }
29190d2215bSHong Zhang     }
29290d2215bSHong Zhang   }
29390d2215bSHong Zhang }
29490d2215bSHong Zhang 
29507e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */
29666976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
2972d1451d4SHong Zhang {
29807e43b41SHong Zhang   PetscInt i, row, slice_id;
29907e43b41SHong Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
3004e58db63SHong Zhang   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
30107e43b41SHong Zhang   double t = 0.0;
30207e43b41SHong Zhang   if (row < nrows) {
30307e43b41SHong Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
30407e43b41SHong Zhang   }
3054e58db63SHong Zhang   #pragma unroll
3064e58db63SHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
3074e58db63SHong Zhang   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
30807e43b41SHong Zhang }
30907e43b41SHong Zhang 
310cca9ff8bSHong Zhang /* use 1 warp per slice, suitable for small slice width */
31166976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
312cca9ff8bSHong Zhang {
313cca9ff8bSHong Zhang   PetscInt i, row, slice_id;
314cca9ff8bSHong Zhang   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
315cca9ff8bSHong Zhang   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
316cca9ff8bSHong Zhang   double t = 0.0;
317cca9ff8bSHong Zhang   if (row < nrows) {
318cca9ff8bSHong Zhang     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
319cca9ff8bSHong Zhang   }
320cca9ff8bSHong Zhang   #pragma unroll
321cca9ff8bSHong Zhang   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
322cca9ff8bSHong Zhang   if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
323cca9ff8bSHong Zhang }
3248711c661SHong Zhang #endif
325cca9ff8bSHong Zhang 
326a9dd396cSHong Zhang /***********  Kernel 2-6  are tied to slice height 16. They are kept only for performance comparison  **********/
327a9dd396cSHong Zhang 
32866976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
32907e43b41SHong Zhang {
33007e43b41SHong Zhang   __shared__ MatScalar shared[512];
3312d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
33207e43b41SHong Zhang   /* multiple threads per row. */
3332d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
3342d1451d4SHong Zhang   if (row < nrows) {
3352d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
3362d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
3372d1451d4SHong Zhang 
3382d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
3392d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
34007e43b41SHong Zhang     __syncthreads();
34107e43b41SHong Zhang     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
34207e43b41SHong Zhang     __syncthreads();
34307e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
3442d1451d4SHong Zhang     __syncthreads();
3452d1451d4SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
3462d1451d4SHong Zhang     __syncthreads();
3472d1451d4SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
3482d1451d4SHong Zhang     __syncthreads();
3492d1451d4SHong Zhang     if (threadIdx.y < 1) {
35007e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
3512d1451d4SHong Zhang       y[row] = shared[threadIdx.x];
3522d1451d4SHong Zhang     }
3532d1451d4SHong Zhang   }
3542d1451d4SHong Zhang }
3552d1451d4SHong Zhang 
35666976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
3572d1451d4SHong Zhang {
35807e43b41SHong Zhang   __shared__ MatScalar shared[512];
3592d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
36007e43b41SHong Zhang   /* multiple threads per row. */
3612d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
3622d1451d4SHong Zhang   if (row < nrows) {
3632d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
3642d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
3652d1451d4SHong Zhang 
3662d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
3672d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
36807e43b41SHong Zhang     __syncthreads();
36907e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
3702d1451d4SHong Zhang     __syncthreads();
3712d1451d4SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
3722d1451d4SHong Zhang     __syncthreads();
3732d1451d4SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
3742d1451d4SHong Zhang     __syncthreads();
3752d1451d4SHong Zhang     if (threadIdx.y < 1) {
37607e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
37707e43b41SHong Zhang       y[row] = shared[threadIdx.x];
37807e43b41SHong Zhang     }
37907e43b41SHong Zhang   }
38007e43b41SHong Zhang }
38107e43b41SHong Zhang 
38266976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
38307e43b41SHong Zhang {
38407e43b41SHong Zhang   __shared__ MatScalar shared[512];
38507e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
38607e43b41SHong Zhang   /* multiple threads per row. */
38707e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
38807e43b41SHong Zhang   if (row < nrows) {
38907e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
39007e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
39107e43b41SHong Zhang 
39207e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
39307e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
39407e43b41SHong Zhang     __syncthreads();
39507e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
39607e43b41SHong Zhang     __syncthreads();
39707e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
39807e43b41SHong Zhang     __syncthreads();
39907e43b41SHong Zhang     if (threadIdx.y < 1) {
40007e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
40107e43b41SHong Zhang       y[row] = shared[threadIdx.x];
40207e43b41SHong Zhang     }
40307e43b41SHong Zhang   }
40407e43b41SHong Zhang }
40507e43b41SHong Zhang 
40666976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
40707e43b41SHong Zhang {
40807e43b41SHong Zhang   __shared__ MatScalar shared[512];
40907e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
41007e43b41SHong Zhang   /* multiple threads per row. */
41107e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
41207e43b41SHong Zhang   if (row < nrows) {
41307e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
41407e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
41507e43b41SHong Zhang 
41607e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
41707e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
41807e43b41SHong Zhang     __syncthreads();
41907e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
42007e43b41SHong Zhang     __syncthreads();
42107e43b41SHong Zhang     if (threadIdx.y < 1) {
42207e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
42307e43b41SHong Zhang       y[row] = shared[threadIdx.x];
42407e43b41SHong Zhang     }
42507e43b41SHong Zhang   }
42607e43b41SHong Zhang }
42707e43b41SHong Zhang 
42866976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
42907e43b41SHong Zhang {
43007e43b41SHong Zhang   __shared__ MatScalar shared[512];
43107e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
43207e43b41SHong Zhang   /* multiple threads per row. */
43307e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
43407e43b41SHong Zhang   if (row < nrows) {
43507e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
43607e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
43707e43b41SHong Zhang 
43807e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
43907e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
44007e43b41SHong Zhang     __syncthreads();
44107e43b41SHong Zhang     if (threadIdx.y < 1) {
44207e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
44307e43b41SHong Zhang       y[row] = shared[threadIdx.x];
44407e43b41SHong Zhang     }
44507e43b41SHong Zhang   }
44607e43b41SHong Zhang }
44707e43b41SHong Zhang 
44866976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
44907e43b41SHong Zhang {
45007e43b41SHong Zhang   __shared__ MatScalar shared[512];
45107e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
45207e43b41SHong Zhang   /* multiple threads per row. */
45307e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
45407e43b41SHong Zhang   if (row < nrows) {
45507e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
45607e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
45707e43b41SHong Zhang 
45807e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
45907e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
46007e43b41SHong Zhang     __syncthreads();
46107e43b41SHong Zhang     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
46207e43b41SHong Zhang     __syncthreads();
46307e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
46407e43b41SHong Zhang     __syncthreads();
46507e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
46607e43b41SHong Zhang     __syncthreads();
46707e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
46807e43b41SHong Zhang     __syncthreads();
46907e43b41SHong Zhang     if (threadIdx.y < 1) {
47007e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
4712d1451d4SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
4722d1451d4SHong Zhang     }
4732d1451d4SHong Zhang   }
4742d1451d4SHong Zhang }
47507e43b41SHong Zhang 
47666976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
47707e43b41SHong Zhang {
47807e43b41SHong Zhang   __shared__ MatScalar shared[512];
47907e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
48007e43b41SHong Zhang   /* multiple threads per row. */
48107e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
48207e43b41SHong Zhang   if (row < nrows) {
48307e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
48407e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
48507e43b41SHong Zhang 
48607e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
48707e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
48807e43b41SHong Zhang     __syncthreads();
48907e43b41SHong Zhang     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
49007e43b41SHong Zhang     __syncthreads();
49107e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
49207e43b41SHong Zhang     __syncthreads();
49307e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
49407e43b41SHong Zhang     __syncthreads();
49507e43b41SHong Zhang     if (threadIdx.y < 1) {
49607e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
49707e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
49807e43b41SHong Zhang     }
49907e43b41SHong Zhang   }
50007e43b41SHong Zhang }
50107e43b41SHong Zhang 
50266976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
50307e43b41SHong Zhang {
50407e43b41SHong Zhang   __shared__ MatScalar shared[512];
50507e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
50607e43b41SHong Zhang   /* multiple threads per row. */
50707e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
50807e43b41SHong Zhang   if (row < nrows) {
50907e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
51007e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
51107e43b41SHong Zhang 
51207e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
51307e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
51407e43b41SHong Zhang     __syncthreads();
51507e43b41SHong Zhang     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
51607e43b41SHong Zhang     __syncthreads();
51707e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
51807e43b41SHong Zhang     __syncthreads();
51907e43b41SHong Zhang     if (threadIdx.y < 1) {
52007e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
52107e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
52207e43b41SHong Zhang     }
52307e43b41SHong Zhang   }
52407e43b41SHong Zhang }
52507e43b41SHong Zhang 
52666976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
52707e43b41SHong Zhang {
52807e43b41SHong Zhang   __shared__ MatScalar shared[512];
52907e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
53007e43b41SHong Zhang   /* multiple threads per row. */
53107e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
53207e43b41SHong Zhang   if (row < nrows) {
53307e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
53407e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
53507e43b41SHong Zhang 
53607e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
53707e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
53807e43b41SHong Zhang     __syncthreads();
53907e43b41SHong Zhang     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
54007e43b41SHong Zhang     __syncthreads();
54107e43b41SHong Zhang     if (threadIdx.y < 1) {
54207e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
54307e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
54407e43b41SHong Zhang     }
54507e43b41SHong Zhang   }
54607e43b41SHong Zhang }
54707e43b41SHong Zhang 
54866976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
54907e43b41SHong Zhang {
55007e43b41SHong Zhang   __shared__ MatScalar shared[512];
55107e43b41SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
55207e43b41SHong Zhang   /* multiple threads per row. */
55307e43b41SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
55407e43b41SHong Zhang   if (row < nrows) {
55507e43b41SHong Zhang     slice_id     = row / SLICE_HEIGHT;
55607e43b41SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
55707e43b41SHong Zhang 
55807e43b41SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
55907e43b41SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
56007e43b41SHong Zhang     __syncthreads();
56107e43b41SHong Zhang     if (threadIdx.y < 1) {
56207e43b41SHong Zhang       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
56307e43b41SHong Zhang       z[row] = y[row] + shared[threadIdx.x];
56407e43b41SHong Zhang     }
56507e43b41SHong Zhang   }
5662d1451d4SHong Zhang }
5672d1451d4SHong Zhang 
56866976f2fSJacob Faibussowitsch static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
5692d1451d4SHong Zhang {
5702d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
5712d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
5722d1451d4SHong Zhang   PetscScalar       *y;
5732d1451d4SHong Zhang   const PetscScalar *x;
574a9dd396cSHong Zhang   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
5752d1451d4SHong Zhang   MatScalar         *aval;
5762d1451d4SHong Zhang   PetscInt          *acolidx;
5772d1451d4SHong Zhang   PetscInt          *sliidx;
57807e43b41SHong Zhang   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
57907e43b41SHong Zhang   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
58016a9b8deSJose E. Roman #if !defined(PETSC_USE_COMPLEX)
58116a9b8deSJose E. Roman   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
582b921024eSHong Zhang   PetscReal maxoveravg;
58316a9b8deSJose E. Roman #endif
5842d1451d4SHong Zhang 
5852d1451d4SHong Zhang   PetscFunctionBegin;
5864e58db63SHong Zhang   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
58790d2215bSHong Zhang   PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
5882d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
5892d1451d4SHong Zhang   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
5902d1451d4SHong Zhang   aval    = cudastruct->val;
5912d1451d4SHong Zhang   acolidx = cudastruct->colidx;
5922d1451d4SHong Zhang   sliidx  = cudastruct->sliidx;
5932d1451d4SHong Zhang 
5942d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayRead(xx, &x));
5952d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayWrite(yy, &y));
5962d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeBegin());
59707e43b41SHong Zhang 
59807e43b41SHong Zhang   switch (cudastruct->kernelchoice) {
5998711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
60007e43b41SHong Zhang   case 9:
6014e58db63SHong Zhang     nblocks = 1 + (nrows - 1) / sliceheight;
6024e58db63SHong Zhang     if (cudastruct->blocky == 2) {
6034e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6044e58db63SHong Zhang     } else if (cudastruct->blocky == 4) {
6054e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6064e58db63SHong Zhang     } else if (cudastruct->blocky == 8) {
6074e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6084e58db63SHong Zhang     } else if (cudastruct->blocky == 16) {
6094e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6104e58db63SHong Zhang     } else if (cudastruct->blocky == 32) {
6114e58db63SHong Zhang       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
612cca9ff8bSHong Zhang     } else {
613cca9ff8bSHong Zhang       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
61407e43b41SHong Zhang     }
61507e43b41SHong Zhang     break;
61607e43b41SHong Zhang   case 7:
6174e58db63SHong Zhang     nblocks = 1 + (nrows - 1) / (2 * sliceheight);
6184e58db63SHong Zhang     if (cudastruct->blocky == 2) {
6194e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6204e58db63SHong Zhang     } else if (cudastruct->blocky == 4) {
6214e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6224e58db63SHong Zhang     } else if (cudastruct->blocky == 8) {
6234e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6244e58db63SHong Zhang     } else if (cudastruct->blocky == 16) {
6254e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6264e58db63SHong Zhang     } else if (cudastruct->blocky == 32) {
6274e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6284e58db63SHong Zhang     } else {
6294e58db63SHong Zhang       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6304e58db63SHong Zhang     }
63107e43b41SHong Zhang     break;
6328711c661SHong Zhang #endif
63307e43b41SHong Zhang   case 6:
63407e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
635a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
63607e43b41SHong Zhang     break;
63707e43b41SHong Zhang   case 5:
63807e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
639a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
64007e43b41SHong Zhang     break;
64107e43b41SHong Zhang   case 4:
64207e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
643a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
64407e43b41SHong Zhang     break;
64507e43b41SHong Zhang   case 3:
64607e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
647a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
64807e43b41SHong Zhang     break;
64907e43b41SHong Zhang   case 2: /* 16 slices per block if blocksize=512 */
65007e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / (blocksize / 2);
651a9dd396cSHong Zhang     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
65207e43b41SHong Zhang     break;
65307e43b41SHong Zhang   case 1: /* 32 slices per block if blocksize=512 */
65407e43b41SHong Zhang     nblocks = 1 + (nrows - 1) / blocksize;
6554e58db63SHong Zhang     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
65607e43b41SHong Zhang     break;
6578711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
65807e43b41SHong Zhang   case 0:
65990d2215bSHong Zhang     maxoveravg = a->maxslicewidth / a->avgslicewidth;
66090d2215bSHong Zhang     if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
66190d2215bSHong Zhang       /* each block handles approximately one slice */
66290d2215bSHong Zhang       PetscInt blocky = a->chunksize / 32;
66390d2215bSHong Zhang       nchunks         = cudastruct->totalchunks;
66490d2215bSHong Zhang       chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
66590d2215bSHong Zhang       nblocks         = 1 + (nchunks - 1) / chunksperblock;
66690d2215bSHong Zhang       chunk_slice_map = cudastruct->chunk_slice_map;
66790d2215bSHong Zhang       if (blocky == 2) {
66890d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
66990d2215bSHong Zhang       } else if (blocky == 4) {
67090d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67190d2215bSHong Zhang       } else if (blocky == 8) {
67290d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67390d2215bSHong Zhang       } else if (blocky == 16) {
67490d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67590d2215bSHong Zhang       } else if (blocky == 32) {
67690d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67790d2215bSHong Zhang       } else {
67890d2215bSHong Zhang         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67990d2215bSHong Zhang       }
6802d1451d4SHong Zhang     } else {
681cca9ff8bSHong Zhang       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
68290d2215bSHong Zhang       if (avgslicesize <= 432) {
68390d2215bSHong Zhang         if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
684cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
685cca9ff8bSHong Zhang           matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
68690d2215bSHong Zhang         } else {
687cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
688cca9ff8bSHong Zhang           matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
68990d2215bSHong Zhang         }
690cca9ff8bSHong Zhang       } else if (avgslicesize <= 2400) {
691cca9ff8bSHong Zhang         nblocks = 1 + (nrows - 1) / sliceheight;
692cca9ff8bSHong Zhang         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
693cca9ff8bSHong Zhang       } else {
6944e58db63SHong Zhang         nblocks = 1 + (nrows - 1) / sliceheight;
6954e58db63SHong Zhang         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6962d1451d4SHong Zhang       }
6972d1451d4SHong Zhang     }
698cca9ff8bSHong Zhang     break;
6998711c661SHong Zhang #endif
7008711c661SHong Zhang   default:
7018711c661SHong Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
702cca9ff8bSHong Zhang   }
7032d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeEnd());
7042d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayRead(xx, &x));
7052d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
7062d1451d4SHong Zhang   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
7072d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
7082d1451d4SHong Zhang }
7092d1451d4SHong Zhang 
71066976f2fSJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
7112d1451d4SHong Zhang {
7122d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
7132d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
7142d1451d4SHong Zhang   PetscScalar       *z;
7152d1451d4SHong Zhang   const PetscScalar *y, *x;
716a9dd396cSHong Zhang   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
7172d1451d4SHong Zhang   MatScalar         *aval    = cudastruct->val;
7182d1451d4SHong Zhang   PetscInt          *acolidx = cudastruct->colidx;
7192d1451d4SHong Zhang   PetscInt          *sliidx  = cudastruct->sliidx;
72016a9b8deSJose E. Roman #if !defined(PETSC_USE_COMPLEX)
72190d2215bSHong Zhang   PetscReal maxoveravg;
72216a9b8deSJose E. Roman   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
72316a9b8deSJose E. Roman   PetscInt  blocky = cudastruct->blocky;
72416a9b8deSJose E. Roman #endif
7252d1451d4SHong Zhang 
7262d1451d4SHong Zhang   PetscFunctionBegin;
72790d2215bSHong Zhang   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
72890d2215bSHong Zhang   PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
7292d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
7302d1451d4SHong Zhang   if (a->nz) {
73116a9b8deSJose E. Roman     PetscInt nblocks, blocksize = 512;
73207e43b41SHong Zhang     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
7332d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(xx, &x));
7342d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(yy, &y));
7352d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayWrite(zz, &z));
7362d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeBegin());
73707e43b41SHong Zhang 
73807e43b41SHong Zhang     switch (cudastruct->kernelchoice) {
7398711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
740cca9ff8bSHong Zhang     case 9:
741cca9ff8bSHong Zhang       nblocks = 1 + (nrows - 1) / sliceheight;
74290d2215bSHong Zhang       if (blocky == 2) {
743cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
74490d2215bSHong Zhang       } else if (blocky == 4) {
745cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
74690d2215bSHong Zhang       } else if (blocky == 8) {
747cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
74890d2215bSHong Zhang       } else if (blocky == 16) {
749cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
75090d2215bSHong Zhang       } else if (blocky == 32) {
751cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
752cca9ff8bSHong Zhang       } else {
753cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
754cca9ff8bSHong Zhang       }
755cca9ff8bSHong Zhang       break;
75690d2215bSHong Zhang     case 8:
75790d2215bSHong Zhang       /* each block handles approximately one slice */
75890d2215bSHong Zhang       nchunks         = cudastruct->totalchunks;
75990d2215bSHong Zhang       blocky          = a->chunksize / 32;
76090d2215bSHong Zhang       chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
76190d2215bSHong Zhang       nblocks         = 1 + (nchunks - 1) / chunksperblock;
76290d2215bSHong Zhang       chunk_slice_map = cudastruct->chunk_slice_map;
76390d2215bSHong Zhang       if (blocky == 2) {
76490d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
76590d2215bSHong Zhang       } else if (blocky == 4) {
76690d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
76790d2215bSHong Zhang       } else if (blocky == 8) {
76890d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
76990d2215bSHong Zhang       } else if (blocky == 16) {
77090d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
77190d2215bSHong Zhang       } else if (blocky == 32) {
77290d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
77390d2215bSHong Zhang       } else {
77490d2215bSHong Zhang         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
77590d2215bSHong Zhang       }
77690d2215bSHong Zhang       break;
777cca9ff8bSHong Zhang     case 7:
778cca9ff8bSHong Zhang       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
77990d2215bSHong Zhang       if (blocky == 2) {
780cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
78190d2215bSHong Zhang       } else if (blocky == 4) {
782cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
78390d2215bSHong Zhang       } else if (blocky == 8) {
784cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
78590d2215bSHong Zhang       } else if (blocky == 16) {
786cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
78790d2215bSHong Zhang       } else if (blocky == 32) {
788cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
789cca9ff8bSHong Zhang       } else {
790cca9ff8bSHong Zhang         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
791cca9ff8bSHong Zhang       }
792cca9ff8bSHong Zhang       break;
7938711c661SHong Zhang #endif
79407e43b41SHong Zhang     case 6:
79507e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 32);
796a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
79707e43b41SHong Zhang       break;
79807e43b41SHong Zhang     case 5:
79907e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 16);
800a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
80107e43b41SHong Zhang       break;
80207e43b41SHong Zhang     case 4:
80307e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 8);
804a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
80507e43b41SHong Zhang       break;
80607e43b41SHong Zhang     case 3:
80707e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 4);
808a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
80907e43b41SHong Zhang       break;
81007e43b41SHong Zhang     case 2:
81107e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / (blocksize / 2);
812a9dd396cSHong Zhang       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
81307e43b41SHong Zhang       break;
81407e43b41SHong Zhang     case 1:
81507e43b41SHong Zhang       nblocks = 1 + (nrows - 1) / blocksize;
8164e58db63SHong Zhang       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
81707e43b41SHong Zhang       break;
8188711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
819cca9ff8bSHong Zhang     case 0:
82090d2215bSHong Zhang       maxoveravg = a->maxslicewidth / a->avgslicewidth;
82190d2215bSHong Zhang       if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
82290d2215bSHong Zhang         /* each block handles approximately one slice */
82390d2215bSHong Zhang         nchunks         = cudastruct->totalchunks;
82490d2215bSHong Zhang         blocky          = a->chunksize / 32;
82590d2215bSHong Zhang         chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
82690d2215bSHong Zhang         nblocks         = 1 + (nchunks - 1) / chunksperblock;
82790d2215bSHong Zhang         chunk_slice_map = cudastruct->chunk_slice_map;
82890d2215bSHong Zhang         if (blocky == 2) {
82990d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83090d2215bSHong Zhang         } else if (blocky == 4) {
83190d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83290d2215bSHong Zhang         } else if (blocky == 8) {
83390d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83490d2215bSHong Zhang         } else if (blocky == 16) {
83590d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83690d2215bSHong Zhang         } else if (blocky == 32) {
83790d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83890d2215bSHong Zhang         } else {
83990d2215bSHong Zhang           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
84090d2215bSHong Zhang         }
841cca9ff8bSHong Zhang       } else {
842cca9ff8bSHong Zhang         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
84390d2215bSHong Zhang         if (avgslicesize <= 432) {
84490d2215bSHong Zhang           if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
845cca9ff8bSHong Zhang             nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
846cca9ff8bSHong Zhang             matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
84790d2215bSHong Zhang           } else {
848cca9ff8bSHong Zhang             nblocks = 1 + (nrows - 1) / sliceheight;
849cca9ff8bSHong Zhang             matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
85090d2215bSHong Zhang           }
851cca9ff8bSHong Zhang         } else if (avgslicesize <= 2400) {
852cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
853cca9ff8bSHong Zhang           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
854cca9ff8bSHong Zhang         } else {
855cca9ff8bSHong Zhang           nblocks = 1 + (nrows - 1) / sliceheight;
856cca9ff8bSHong Zhang           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
857cca9ff8bSHong Zhang         }
858cca9ff8bSHong Zhang       }
85907e43b41SHong Zhang       break;
8608711c661SHong Zhang #endif
8618711c661SHong Zhang     default:
8628711c661SHong Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
8632d1451d4SHong Zhang     }
8642d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeEnd());
8652d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(xx, &x));
8662d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(yy, &y));
8672d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
8682d1451d4SHong Zhang     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
8692d1451d4SHong Zhang   } else {
8702d1451d4SHong Zhang     PetscCall(VecCopy(yy, zz));
8712d1451d4SHong Zhang   }
8722d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
8732d1451d4SHong Zhang }
8742d1451d4SHong Zhang 
8752d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
8762d1451d4SHong Zhang {
87707e43b41SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
8784e58db63SHong Zhang   PetscInt         kernel, blocky;
87907e43b41SHong Zhang   PetscBool        flg;
88007e43b41SHong Zhang 
8812d1451d4SHong Zhang   PetscFunctionBegin;
8822d1451d4SHong Zhang   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
88390d2215bSHong Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
88490d2215bSHong Zhang   if (flg) {
88590d2215bSHong Zhang     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
88690d2215bSHong Zhang     cudastruct->blocky = blocky;
88790d2215bSHong Zhang   }
88807e43b41SHong Zhang   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
88907e43b41SHong Zhang   if (flg) {
89007e43b41SHong Zhang     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
89107e43b41SHong Zhang     cudastruct->kernelchoice = kernel;
89290d2215bSHong Zhang     if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
8934e58db63SHong Zhang   }
8942d1451d4SHong Zhang   PetscOptionsHeadEnd();
8952d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
8962d1451d4SHong Zhang }
8972d1451d4SHong Zhang 
89807e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
89907e43b41SHong Zhang {
90007e43b41SHong Zhang   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
90107e43b41SHong Zhang 
90290d2215bSHong Zhang   PetscFunctionBegin;
90307e43b41SHong Zhang   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
90407e43b41SHong Zhang   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
90507e43b41SHong Zhang   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
90607e43b41SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
90707e43b41SHong Zhang }
90807e43b41SHong Zhang 
9092d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
9102d1451d4SHong Zhang {
9112d1451d4SHong Zhang   PetscFunctionBegin;
9122d1451d4SHong Zhang   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
91307e43b41SHong Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
9142d1451d4SHong Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
9152d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
9162d1451d4SHong Zhang   A->ops->mult    = MatMult_SeqSELLCUDA;
9172d1451d4SHong Zhang   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
9182d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9192d1451d4SHong Zhang }
9202d1451d4SHong Zhang 
9218df136f9SHong Zhang static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
9228df136f9SHong Zhang {
9238df136f9SHong Zhang   PetscBool    both = PETSC_FALSE;
9248df136f9SHong Zhang   Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;
9258df136f9SHong Zhang 
9268df136f9SHong Zhang   PetscFunctionBegin;
9278df136f9SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) {
9288df136f9SHong Zhang     Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
9298df136f9SHong Zhang     if (cudastruct->val) {
9308df136f9SHong Zhang       both = PETSC_TRUE;
931*f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
9328df136f9SHong Zhang     }
9338df136f9SHong Zhang   }
9348df136f9SHong Zhang   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
9358df136f9SHong Zhang   PetscCall(MatSeqSELLInvalidateDiagonal(A));
9368df136f9SHong Zhang   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
9378df136f9SHong Zhang   else A->offloadmask = PETSC_OFFLOAD_CPU;
9388df136f9SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9398df136f9SHong Zhang }
9408df136f9SHong Zhang 
9412d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
9422d1451d4SHong Zhang {
9432d1451d4SHong Zhang   PetscFunctionBegin;
9448df136f9SHong Zhang   if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
9452d1451d4SHong Zhang   PetscCall(MatDestroy_SeqSELL(A));
9462d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9472d1451d4SHong Zhang }
9482d1451d4SHong Zhang 
9492d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
9502d1451d4SHong Zhang static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
9512d1451d4SHong Zhang {
9522d1451d4SHong Zhang   PetscFunctionBegin;
9532d1451d4SHong Zhang   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
9542d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
9552d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9562d1451d4SHong Zhang }
9572d1451d4SHong Zhang 
9588df136f9SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
9592d1451d4SHong Zhang {
9602d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct;
9612d1451d4SHong Zhang 
9622d1451d4SHong Zhang   PetscFunctionBegin;
9632d1451d4SHong Zhang   PetscCall(PetscFree(B->defaultvectype));
9642d1451d4SHong Zhang   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
9652d1451d4SHong Zhang 
9662d1451d4SHong Zhang   if (!B->spptr) {
9672d1451d4SHong Zhang     if (B->factortype == MAT_FACTOR_NONE) {
9682d1451d4SHong Zhang       PetscCall(PetscNew(&cudastruct));
9692d1451d4SHong Zhang       B->spptr = cudastruct;
9702d1451d4SHong Zhang     }
9712d1451d4SHong Zhang   }
9722d1451d4SHong Zhang 
9732d1451d4SHong Zhang   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
9742d1451d4SHong Zhang   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
9752d1451d4SHong Zhang   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
9762d1451d4SHong Zhang   B->ops->mult           = MatMult_SeqSELLCUDA;
9772d1451d4SHong Zhang   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
9782d1451d4SHong Zhang   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
9798df136f9SHong Zhang   B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;
9802d1451d4SHong Zhang 
98107e43b41SHong Zhang   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
98207e43b41SHong Zhang   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
98307e43b41SHong Zhang 
9842d1451d4SHong Zhang   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
9852d1451d4SHong Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
9862d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9872d1451d4SHong Zhang }
9882d1451d4SHong Zhang 
9892d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
9902d1451d4SHong Zhang {
9912d1451d4SHong Zhang   PetscFunctionBegin;
9922d1451d4SHong Zhang   PetscCall(MatCreate_SeqSELL(B));
9932d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
9948df136f9SHong Zhang   PetscCall(MatSetFromOptions(B));
9952d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
9962d1451d4SHong Zhang }
997