12d1451d4SHong Zhang #include <petscdevice_cuda.h>
26eb97cccSStefano Zampini #include <petsc/private/cupmatomics.hpp>
32d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/
42d1451d4SHong Zhang
507e43b41SHong Zhang #define SLICE_HEIGHT 16
607e43b41SHong Zhang
72d1451d4SHong Zhang typedef struct {
890d2215bSHong Zhang PetscInt maxallocmat;
990d2215bSHong Zhang PetscInt totalentries;
1090d2215bSHong Zhang PetscInt *colidx; /* column index array, device pointer */
1190d2215bSHong Zhang MatScalar *val; /* value array, device pointer */
1290d2215bSHong Zhang PetscInt totalslices;
1390d2215bSHong Zhang PetscInt *sliidx; /* slice index array, device pointer */
142d1451d4SHong Zhang PetscInt nonzerostate;
1507e43b41SHong Zhang PetscInt kernelchoice;
164e58db63SHong Zhang PetscInt blocky;
1790d2215bSHong Zhang PetscInt chunksperblock;
1890d2215bSHong Zhang PetscInt totalchunks;
1990d2215bSHong Zhang PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */
202d1451d4SHong Zhang } Mat_SeqSELLCUDA;
212d1451d4SHong Zhang
MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA ** cudastruct)222d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
232d1451d4SHong Zhang {
242d1451d4SHong Zhang PetscFunctionBegin;
252d1451d4SHong Zhang if (*cudastruct) {
263a7d0413SPierre Jolivet if ((*cudastruct)->colidx) PetscCallCUDA(cudaFree((*cudastruct)->colidx));
273a7d0413SPierre Jolivet if ((*cudastruct)->val) PetscCallCUDA(cudaFree((*cudastruct)->val));
283a7d0413SPierre Jolivet if ((*cudastruct)->sliidx) PetscCallCUDA(cudaFree((*cudastruct)->sliidx));
293a7d0413SPierre Jolivet if ((*cudastruct)->chunk_slice_map) PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map));
302d1451d4SHong Zhang PetscCall(PetscFree(*cudastruct));
312d1451d4SHong Zhang }
322d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
332d1451d4SHong Zhang }
342d1451d4SHong Zhang
MatSeqSELLCUDACopyToGPU(Mat A)352d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
362d1451d4SHong Zhang {
372d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
382d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
392d1451d4SHong Zhang
402d1451d4SHong Zhang PetscFunctionBegin;
412d1451d4SHong Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
422d1451d4SHong Zhang PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
432d1451d4SHong Zhang if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
442d1451d4SHong Zhang /* copy values only */
452d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
462d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
472d1451d4SHong Zhang } else {
483a7d0413SPierre Jolivet if (cudastruct->colidx) PetscCallCUDA(cudaFree(cudastruct->colidx));
493a7d0413SPierre Jolivet if (cudastruct->val) PetscCallCUDA(cudaFree(cudastruct->val));
503a7d0413SPierre Jolivet if (cudastruct->sliidx) PetscCallCUDA(cudaFree(cudastruct->sliidx));
513a7d0413SPierre Jolivet if (cudastruct->chunk_slice_map) PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map));
5290d2215bSHong Zhang cudastruct->maxallocmat = a->maxallocmat;
5390d2215bSHong Zhang cudastruct->totalentries = a->sliidx[a->totalslices];
5490d2215bSHong Zhang cudastruct->totalslices = a->totalslices;
5590d2215bSHong Zhang cudastruct->totalchunks = a->totalchunks;
56f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
57f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
582d1451d4SHong Zhang /* copy values, nz or maxallocmat? */
59f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
60f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));
6107e43b41SHong Zhang
62f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
63f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
64f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
65f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
6690d2215bSHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
672d1451d4SHong Zhang }
682d1451d4SHong Zhang PetscCallCUDA(WaitForCUDA());
692d1451d4SHong Zhang PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
702d1451d4SHong Zhang A->offloadmask = PETSC_OFFLOAD_BOTH;
712d1451d4SHong Zhang }
722d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
732d1451d4SHong Zhang }
742d1451d4SHong Zhang
matmult_seqsell_basic_kernel(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)7566976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
762d1451d4SHong Zhang {
772d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice;
782d1451d4SHong Zhang MatScalar sum;
792d1451d4SHong Zhang /* one thread per row. */
802d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
812d1451d4SHong Zhang if (row < nrows) {
824e58db63SHong Zhang slice_id = row / sliceheight;
834e58db63SHong Zhang row_in_slice = row % sliceheight;
842d1451d4SHong Zhang sum = 0.0;
854e58db63SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
862d1451d4SHong Zhang y[row] = sum;
872d1451d4SHong Zhang }
882d1451d4SHong Zhang }
892d1451d4SHong Zhang
matmultadd_seqsell_basic_kernel(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)9066976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
912d1451d4SHong Zhang {
922d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice;
932d1451d4SHong Zhang MatScalar sum;
942d1451d4SHong Zhang /* one thread per row. */
952d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
962d1451d4SHong Zhang if (row < nrows) {
974e58db63SHong Zhang slice_id = row / sliceheight;
984e58db63SHong Zhang row_in_slice = row % sliceheight;
992d1451d4SHong Zhang sum = 0.0;
1004e58db63SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
1012d1451d4SHong Zhang z[row] = y[row] + sum;
1022d1451d4SHong Zhang }
1032d1451d4SHong Zhang }
10407e43b41SHong Zhang
1058711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
10607e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */
10707e43b41SHong Zhang template <int BLOCKY>
matmult_seqsell_tiled_kernel9(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)1084e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
10907e43b41SHong Zhang {
1104e58db63SHong Zhang __shared__ MatScalar shared[32][BLOCKY];
1114e58db63SHong Zhang PetscInt i, row, slice_id = blockIdx.x;
1124e58db63SHong Zhang int tid = threadIdx.x + threadIdx.y * 32;
11307e43b41SHong Zhang /* transposed index */
11407e43b41SHong Zhang int tidx = tid % BLOCKY;
11507e43b41SHong Zhang int tidy = tid / BLOCKY;
11607e43b41SHong Zhang PetscScalar t = 0.0;
1174e58db63SHong Zhang
1184e58db63SHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight;
11907e43b41SHong Zhang if (row < nrows) {
1204e58db63SHong Zhang for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
1212d1451d4SHong Zhang }
1224e58db63SHong Zhang #pragma unroll
123*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
12407e43b41SHong Zhang /* transpose layout to reduce each row using warp shfl */
1251f0d1278SHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
12607e43b41SHong Zhang __syncthreads();
1271f0d1278SHong Zhang if (tidy < sliceheight) t = shared[tidy][tidx];
12807e43b41SHong Zhang #pragma unroll
129*ac530a7eSPierre Jolivet for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
130*ac530a7eSPierre Jolivet if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
13107e43b41SHong Zhang __syncthreads();
132*ac530a7eSPierre Jolivet if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
1332d1451d4SHong Zhang }
1342d1451d4SHong Zhang
135cca9ff8bSHong Zhang /* use 1 block per slice, suitable for large slice width */
136cca9ff8bSHong Zhang template <int BLOCKY>
matmultadd_seqsell_tiled_kernel9(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)137cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
138cca9ff8bSHong Zhang {
139cca9ff8bSHong Zhang __shared__ MatScalar shared[32][BLOCKY];
140cca9ff8bSHong Zhang PetscInt i, row, slice_id = blockIdx.x;
141cca9ff8bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32;
142cca9ff8bSHong Zhang /* transposed index */
143cca9ff8bSHong Zhang int tidx = tid % BLOCKY;
144cca9ff8bSHong Zhang int tidy = tid / BLOCKY;
145cca9ff8bSHong Zhang PetscScalar t = 0.0;
146cca9ff8bSHong Zhang
147cca9ff8bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight;
148cca9ff8bSHong Zhang if (row < nrows) {
149cca9ff8bSHong Zhang for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
150cca9ff8bSHong Zhang }
151cca9ff8bSHong Zhang #pragma unroll
152*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
153cca9ff8bSHong Zhang /* transpose layout to reduce each row using warp shfl */
1541f0d1278SHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
155cca9ff8bSHong Zhang __syncthreads();
1561f0d1278SHong Zhang if (tidy < sliceheight) t = shared[tidy][tidx];
157cca9ff8bSHong Zhang #pragma unroll
158*ac530a7eSPierre Jolivet for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
159*ac530a7eSPierre Jolivet if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
160cca9ff8bSHong Zhang __syncthreads();
161*ac530a7eSPierre Jolivet if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
162cca9ff8bSHong Zhang }
163cca9ff8bSHong Zhang
16490d2215bSHong Zhang template <int BLOCKY>
segment_scan(PetscInt flag[],MatScalar shared[],PetscScalar * val)16566976f2fSJacob Faibussowitsch __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
16690d2215bSHong Zhang {
16790d2215bSHong Zhang bool head = true;
16890d2215bSHong Zhang #pragma unroll
16990d2215bSHong Zhang for (int i = 1; i < BLOCKY * 2; i <<= 1) {
17090d2215bSHong Zhang int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16;
17190d2215bSHong Zhang shared[threadIdx.x + threadIdx.y * 32] = 0;
17290d2215bSHong Zhang if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
17390d2215bSHong Zhang shared[threadIdx.x + threadIdx.y * 32] = *val;
17490d2215bSHong Zhang if (i == 1) head = false;
17590d2215bSHong Zhang }
17690d2215bSHong Zhang __syncthreads();
17790d2215bSHong Zhang if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
17890d2215bSHong Zhang __syncthreads();
17990d2215bSHong Zhang }
18090d2215bSHong Zhang return head;
18190d2215bSHong Zhang }
18290d2215bSHong Zhang
18390d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */
18490d2215bSHong Zhang template <int BLOCKY>
matmult_seqsell_tiled_kernel8(PetscInt nrows,PetscInt sliceheight,PetscInt chunksperblock,PetscInt totalchunks,const PetscInt * chunk_slice_map,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)18590d2215bSHong Zhang __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
18690d2215bSHong Zhang {
18790d2215bSHong Zhang __shared__ MatScalar shared[BLOCKY * 32];
18890d2215bSHong Zhang PetscInt gid, row, start_slice, cid;
18990d2215bSHong Zhang PetscScalar t = 0.0;
1906eb97cccSStefano Zampini AtomicAdd<MatScalar> atomAdd;
19190d2215bSHong Zhang /* zero out y */
19290d2215bSHong Zhang for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
19390d2215bSHong Zhang gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
19490d2215bSHong Zhang if (gid < nrows) y[gid] = 0.0;
19590d2215bSHong Zhang }
19690d2215bSHong Zhang for (int iter = 0; iter < chunksperblock; iter++) {
19790d2215bSHong Zhang cid = blockIdx.x * chunksperblock + iter; /* chunk id */
19890d2215bSHong Zhang if (cid < totalchunks) {
19990d2215bSHong Zhang start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
20090d2215bSHong Zhang gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
20190d2215bSHong Zhang if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
20290d2215bSHong Zhang __shared__ PetscInt flag[BLOCKY * 2];
20390d2215bSHong Zhang bool write;
204b7c0efcaSStefano Zampini PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
20590d2215bSHong Zhang /* find out the slice that this element belongs to */
20690d2215bSHong Zhang while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
20790d2215bSHong Zhang if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
20890d2215bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight;
20990d2215bSHong Zhang if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
21090d2215bSHong Zhang __syncthreads();
21190d2215bSHong Zhang write = segment_scan<BLOCKY>(flag, shared, &t);
2126eb97cccSStefano Zampini if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
21390d2215bSHong Zhang t = 0.0;
21490d2215bSHong Zhang } else { /* this iteration covers only one slice */
21590d2215bSHong Zhang row = start_slice * sliceheight + threadIdx.x % sliceheight;
21690d2215bSHong Zhang if (row < nrows) t += aval[gid] * x[acolidx[gid]];
21790d2215bSHong Zhang if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
21890d2215bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
21990d2215bSHong Zhang /* reduction and write to output vector */
22090d2215bSHong Zhang #pragma unroll
221*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
22290d2215bSHong Zhang /* transpose layout to reduce each row using warp shfl */
22390d2215bSHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
22490d2215bSHong Zhang __syncthreads();
22590d2215bSHong Zhang if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
22690d2215bSHong Zhang #pragma unroll
227*ac530a7eSPierre Jolivet for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
228*ac530a7eSPierre Jolivet if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
22990d2215bSHong Zhang __syncthreads();
2306eb97cccSStefano Zampini if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
23190d2215bSHong Zhang t = 0.0;
23290d2215bSHong Zhang }
23390d2215bSHong Zhang }
23490d2215bSHong Zhang }
23590d2215bSHong Zhang }
23690d2215bSHong Zhang }
23790d2215bSHong Zhang
23890d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */
23990d2215bSHong Zhang template <int BLOCKY>
matmultadd_seqsell_tiled_kernel8(PetscInt nrows,PetscInt sliceheight,PetscInt chunksperblock,PetscInt totalchunks,const PetscInt * chunk_slice_map,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)24090d2215bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
24190d2215bSHong Zhang {
24290d2215bSHong Zhang __shared__ MatScalar shared[BLOCKY * 32];
24390d2215bSHong Zhang PetscInt gid, row, start_slice, cid;
24490d2215bSHong Zhang PetscScalar t = 0.0;
2456eb97cccSStefano Zampini AtomicAdd<MatScalar> atomAdd;
24690d2215bSHong Zhang /* copy y to z */
24790d2215bSHong Zhang for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
24890d2215bSHong Zhang gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
24990d2215bSHong Zhang if (gid < nrows) z[gid] = y[gid];
25090d2215bSHong Zhang }
25190d2215bSHong Zhang for (int iter = 0; iter < chunksperblock; iter++) {
25290d2215bSHong Zhang cid = blockIdx.x * chunksperblock + iter; /* chunk id */
25390d2215bSHong Zhang if (cid < totalchunks) {
25490d2215bSHong Zhang start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
25590d2215bSHong Zhang gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
25690d2215bSHong Zhang if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
25790d2215bSHong Zhang __shared__ PetscInt flag[BLOCKY * 2];
25890d2215bSHong Zhang bool write;
259b7c0efcaSStefano Zampini PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
26090d2215bSHong Zhang /* find out the slice that this element belongs to */
26190d2215bSHong Zhang while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
26290d2215bSHong Zhang if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
26390d2215bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight;
26490d2215bSHong Zhang if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
26590d2215bSHong Zhang __syncthreads();
26690d2215bSHong Zhang write = segment_scan<BLOCKY>(flag, shared, &t);
2676eb97cccSStefano Zampini if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
26890d2215bSHong Zhang t = 0.0;
26990d2215bSHong Zhang } else { /* this iteration covers only one slice */
27090d2215bSHong Zhang row = start_slice * sliceheight + threadIdx.x % sliceheight;
27190d2215bSHong Zhang if (row < nrows) t += aval[gid] * x[acolidx[gid]];
27290d2215bSHong Zhang if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
27390d2215bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
27490d2215bSHong Zhang /* reduction and write to output vector */
27590d2215bSHong Zhang #pragma unroll
276*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
27790d2215bSHong Zhang /* transpose layout to reduce each row using warp shfl */
27890d2215bSHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
27990d2215bSHong Zhang __syncthreads();
28090d2215bSHong Zhang if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
28190d2215bSHong Zhang #pragma unroll
282*ac530a7eSPierre Jolivet for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
283*ac530a7eSPierre Jolivet if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
28490d2215bSHong Zhang __syncthreads();
2856eb97cccSStefano Zampini if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
28690d2215bSHong Zhang t = 0.0;
28790d2215bSHong Zhang }
28890d2215bSHong Zhang }
28990d2215bSHong Zhang }
29090d2215bSHong Zhang }
29190d2215bSHong Zhang }
29290d2215bSHong Zhang
29307e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */
matmult_seqsell_tiled_kernel7(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)29466976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
2952d1451d4SHong Zhang {
29607e43b41SHong Zhang PetscInt i, row, slice_id;
29707e43b41SHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y;
2984e58db63SHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight;
29907e43b41SHong Zhang double t = 0.0;
30007e43b41SHong Zhang if (row < nrows) {
30107e43b41SHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
30207e43b41SHong Zhang }
3034e58db63SHong Zhang #pragma unroll
304*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
305*ac530a7eSPierre Jolivet if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
30607e43b41SHong Zhang }
30707e43b41SHong Zhang
308cca9ff8bSHong Zhang /* use 1 warp per slice, suitable for small slice width */
matmultadd_seqsell_tiled_kernel7(PetscInt nrows,PetscInt sliceheight,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)30966976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
310cca9ff8bSHong Zhang {
311cca9ff8bSHong Zhang PetscInt i, row, slice_id;
312cca9ff8bSHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y;
313cca9ff8bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight;
314cca9ff8bSHong Zhang double t = 0.0;
315cca9ff8bSHong Zhang if (row < nrows) {
316cca9ff8bSHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
317cca9ff8bSHong Zhang }
318cca9ff8bSHong Zhang #pragma unroll
319*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
320*ac530a7eSPierre Jolivet if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
321cca9ff8bSHong Zhang }
3228711c661SHong Zhang #endif
323cca9ff8bSHong Zhang
324a9dd396cSHong Zhang /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/
325a9dd396cSHong Zhang
matmult_seqsell_tiled_kernel6(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)32666976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
32707e43b41SHong Zhang {
32807e43b41SHong Zhang __shared__ MatScalar shared[512];
3292d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice;
33007e43b41SHong Zhang /* multiple threads per row. */
3312d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
3322d1451d4SHong Zhang if (row < nrows) {
3332d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT;
3342d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT;
3352d1451d4SHong Zhang
3362d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
3372d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
33807e43b41SHong Zhang __syncthreads();
339*ac530a7eSPierre Jolivet if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
34007e43b41SHong Zhang __syncthreads();
341*ac530a7eSPierre Jolivet if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
3422d1451d4SHong Zhang __syncthreads();
343*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
3442d1451d4SHong Zhang __syncthreads();
345*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
3462d1451d4SHong Zhang __syncthreads();
3472d1451d4SHong Zhang if (threadIdx.y < 1) {
34807e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
3492d1451d4SHong Zhang y[row] = shared[threadIdx.x];
3502d1451d4SHong Zhang }
3512d1451d4SHong Zhang }
3522d1451d4SHong Zhang }
3532d1451d4SHong Zhang
matmult_seqsell_tiled_kernel5(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)35466976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
3552d1451d4SHong Zhang {
35607e43b41SHong Zhang __shared__ MatScalar shared[512];
3572d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice;
35807e43b41SHong Zhang /* multiple threads per row. */
3592d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
3602d1451d4SHong Zhang if (row < nrows) {
3612d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT;
3622d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT;
3632d1451d4SHong Zhang
3642d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
3652d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
36607e43b41SHong Zhang __syncthreads();
367*ac530a7eSPierre Jolivet if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
3682d1451d4SHong Zhang __syncthreads();
369*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
3702d1451d4SHong Zhang __syncthreads();
371*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
3722d1451d4SHong Zhang __syncthreads();
3732d1451d4SHong Zhang if (threadIdx.y < 1) {
37407e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
37507e43b41SHong Zhang y[row] = shared[threadIdx.x];
37607e43b41SHong Zhang }
37707e43b41SHong Zhang }
37807e43b41SHong Zhang }
37907e43b41SHong Zhang
matmult_seqsell_tiled_kernel4(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)38066976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
38107e43b41SHong Zhang {
38207e43b41SHong Zhang __shared__ MatScalar shared[512];
38307e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice;
38407e43b41SHong Zhang /* multiple threads per row. */
38507e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
38607e43b41SHong Zhang if (row < nrows) {
38707e43b41SHong Zhang slice_id = row / SLICE_HEIGHT;
38807e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT;
38907e43b41SHong Zhang
39007e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
39107e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
39207e43b41SHong Zhang __syncthreads();
393*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
39407e43b41SHong Zhang __syncthreads();
395*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
39607e43b41SHong Zhang __syncthreads();
39707e43b41SHong Zhang if (threadIdx.y < 1) {
39807e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
39907e43b41SHong Zhang y[row] = shared[threadIdx.x];
40007e43b41SHong Zhang }
40107e43b41SHong Zhang }
40207e43b41SHong Zhang }
40307e43b41SHong Zhang
matmult_seqsell_tiled_kernel3(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)40466976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
40507e43b41SHong Zhang {
40607e43b41SHong Zhang __shared__ MatScalar shared[512];
40707e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice;
40807e43b41SHong Zhang /* multiple threads per row. */
40907e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
41007e43b41SHong Zhang if (row < nrows) {
41107e43b41SHong Zhang slice_id = row / SLICE_HEIGHT;
41207e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT;
41307e43b41SHong Zhang
41407e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
41507e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
41607e43b41SHong Zhang __syncthreads();
417*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
41807e43b41SHong Zhang __syncthreads();
41907e43b41SHong Zhang if (threadIdx.y < 1) {
42007e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
42107e43b41SHong Zhang y[row] = shared[threadIdx.x];
42207e43b41SHong Zhang }
42307e43b41SHong Zhang }
42407e43b41SHong Zhang }
42507e43b41SHong Zhang
matmult_seqsell_tiled_kernel2(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,PetscScalar * y)42666976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
42707e43b41SHong Zhang {
42807e43b41SHong Zhang __shared__ MatScalar shared[512];
42907e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice;
43007e43b41SHong Zhang /* multiple threads per row. */
43107e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
43207e43b41SHong Zhang if (row < nrows) {
43307e43b41SHong Zhang slice_id = row / SLICE_HEIGHT;
43407e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT;
43507e43b41SHong Zhang
43607e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
43707e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
43807e43b41SHong Zhang __syncthreads();
43907e43b41SHong Zhang if (threadIdx.y < 1) {
44007e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
44107e43b41SHong Zhang y[row] = shared[threadIdx.x];
44207e43b41SHong Zhang }
44307e43b41SHong Zhang }
44407e43b41SHong Zhang }
44507e43b41SHong Zhang
matmultadd_seqsell_tiled_kernel6(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)44666976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
44707e43b41SHong Zhang {
44807e43b41SHong Zhang __shared__ MatScalar shared[512];
44907e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice;
45007e43b41SHong Zhang /* multiple threads per row. */
45107e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
45207e43b41SHong Zhang if (row < nrows) {
45307e43b41SHong Zhang slice_id = row / SLICE_HEIGHT;
45407e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT;
45507e43b41SHong Zhang
45607e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
45707e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
45807e43b41SHong Zhang __syncthreads();
459*ac530a7eSPierre Jolivet if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
46007e43b41SHong Zhang __syncthreads();
461*ac530a7eSPierre Jolivet if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
46207e43b41SHong Zhang __syncthreads();
463*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
46407e43b41SHong Zhang __syncthreads();
465*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
46607e43b41SHong Zhang __syncthreads();
46707e43b41SHong Zhang if (threadIdx.y < 1) {
46807e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
4692d1451d4SHong Zhang z[row] = y[row] + shared[threadIdx.x];
4702d1451d4SHong Zhang }
4712d1451d4SHong Zhang }
4722d1451d4SHong Zhang }
47307e43b41SHong Zhang
matmultadd_seqsell_tiled_kernel5(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)47466976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
47507e43b41SHong Zhang {
47607e43b41SHong Zhang __shared__ MatScalar shared[512];
47707e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice;
47807e43b41SHong Zhang /* multiple threads per row. */
47907e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
48007e43b41SHong Zhang if (row < nrows) {
48107e43b41SHong Zhang slice_id = row / SLICE_HEIGHT;
48207e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT;
48307e43b41SHong Zhang
48407e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
48507e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
48607e43b41SHong Zhang __syncthreads();
487*ac530a7eSPierre Jolivet if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
48807e43b41SHong Zhang __syncthreads();
489*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
49007e43b41SHong Zhang __syncthreads();
491*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
49207e43b41SHong Zhang __syncthreads();
49307e43b41SHong Zhang if (threadIdx.y < 1) {
49407e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
49507e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x];
49607e43b41SHong Zhang }
49707e43b41SHong Zhang }
49807e43b41SHong Zhang }
49907e43b41SHong Zhang
matmultadd_seqsell_tiled_kernel4(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)50066976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
50107e43b41SHong Zhang {
50207e43b41SHong Zhang __shared__ MatScalar shared[512];
50307e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice;
50407e43b41SHong Zhang /* multiple threads per row. */
50507e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
50607e43b41SHong Zhang if (row < nrows) {
50707e43b41SHong Zhang slice_id = row / SLICE_HEIGHT;
50807e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT;
50907e43b41SHong Zhang
51007e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
51107e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
51207e43b41SHong Zhang __syncthreads();
513*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
51407e43b41SHong Zhang __syncthreads();
515*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
51607e43b41SHong Zhang __syncthreads();
51707e43b41SHong Zhang if (threadIdx.y < 1) {
51807e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
51907e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x];
52007e43b41SHong Zhang }
52107e43b41SHong Zhang }
52207e43b41SHong Zhang }
52307e43b41SHong Zhang
matmultadd_seqsell_tiled_kernel3(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)52466976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
52507e43b41SHong Zhang {
52607e43b41SHong Zhang __shared__ MatScalar shared[512];
52707e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice;
52807e43b41SHong Zhang /* multiple threads per row. */
52907e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
53007e43b41SHong Zhang if (row < nrows) {
53107e43b41SHong Zhang slice_id = row / SLICE_HEIGHT;
53207e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT;
53307e43b41SHong Zhang
53407e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
53507e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
53607e43b41SHong Zhang __syncthreads();
537*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
53807e43b41SHong Zhang __syncthreads();
53907e43b41SHong Zhang if (threadIdx.y < 1) {
54007e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
54107e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x];
54207e43b41SHong Zhang }
54307e43b41SHong Zhang }
54407e43b41SHong Zhang }
54507e43b41SHong Zhang
matmultadd_seqsell_tiled_kernel2(PetscInt nrows,const PetscInt * acolidx,const MatScalar * aval,const PetscInt * sliidx,const PetscScalar * x,const PetscScalar * y,PetscScalar * z)54666976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
54707e43b41SHong Zhang {
54807e43b41SHong Zhang __shared__ MatScalar shared[512];
54907e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice;
55007e43b41SHong Zhang /* multiple threads per row. */
55107e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x;
55207e43b41SHong Zhang if (row < nrows) {
55307e43b41SHong Zhang slice_id = row / SLICE_HEIGHT;
55407e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT;
55507e43b41SHong Zhang
55607e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
55707e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
55807e43b41SHong Zhang __syncthreads();
55907e43b41SHong Zhang if (threadIdx.y < 1) {
56007e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
56107e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x];
56207e43b41SHong Zhang }
56307e43b41SHong Zhang }
5642d1451d4SHong Zhang }
5652d1451d4SHong Zhang
MatMult_SeqSELLCUDA(Mat A,Vec xx,Vec yy)56666976f2fSJacob Faibussowitsch static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
5672d1451d4SHong Zhang {
5682d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
5692d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
5702d1451d4SHong Zhang PetscScalar *y;
5712d1451d4SHong Zhang const PetscScalar *x;
572a9dd396cSHong Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight;
5732d1451d4SHong Zhang MatScalar *aval;
5742d1451d4SHong Zhang PetscInt *acolidx;
5752d1451d4SHong Zhang PetscInt *sliidx;
57607e43b41SHong Zhang PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
57707e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
57816a9b8deSJose E. Roman #if !defined(PETSC_USE_COMPLEX)
57916a9b8deSJose E. Roman PetscInt chunksperblock, nchunks, *chunk_slice_map;
580b921024eSHong Zhang PetscReal maxoveravg;
58116a9b8deSJose E. Roman #endif
5822d1451d4SHong Zhang
5832d1451d4SHong Zhang PetscFunctionBegin;
5844e58db63SHong Zhang PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
58590d2215bSHong Zhang PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
5862d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A));
5872d1451d4SHong Zhang /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
5882d1451d4SHong Zhang aval = cudastruct->val;
5892d1451d4SHong Zhang acolidx = cudastruct->colidx;
5902d1451d4SHong Zhang sliidx = cudastruct->sliidx;
5912d1451d4SHong Zhang
5922d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x));
5932d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(yy, &y));
5942d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin());
59507e43b41SHong Zhang
59607e43b41SHong Zhang switch (cudastruct->kernelchoice) {
5978711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
59807e43b41SHong Zhang case 9:
5994e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight;
6004e58db63SHong Zhang if (cudastruct->blocky == 2) {
6014e58db63SHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6024e58db63SHong Zhang } else if (cudastruct->blocky == 4) {
6034e58db63SHong Zhang matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6044e58db63SHong Zhang } else if (cudastruct->blocky == 8) {
6054e58db63SHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6064e58db63SHong Zhang } else if (cudastruct->blocky == 16) {
6074e58db63SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6084e58db63SHong Zhang } else if (cudastruct->blocky == 32) {
6094e58db63SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
610cca9ff8bSHong Zhang } else {
611cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
61207e43b41SHong Zhang }
61307e43b41SHong Zhang break;
61407e43b41SHong Zhang case 7:
6154e58db63SHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight);
6164e58db63SHong Zhang if (cudastruct->blocky == 2) {
6174e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6184e58db63SHong Zhang } else if (cudastruct->blocky == 4) {
6194e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6204e58db63SHong Zhang } else if (cudastruct->blocky == 8) {
6214e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6224e58db63SHong Zhang } else if (cudastruct->blocky == 16) {
6234e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6244e58db63SHong Zhang } else if (cudastruct->blocky == 32) {
6254e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6264e58db63SHong Zhang } else {
6274e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6284e58db63SHong Zhang }
62907e43b41SHong Zhang break;
6308711c661SHong Zhang #endif
63107e43b41SHong Zhang case 6:
63207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
633a9dd396cSHong Zhang matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
63407e43b41SHong Zhang break;
63507e43b41SHong Zhang case 5:
63607e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
637a9dd396cSHong Zhang matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
63807e43b41SHong Zhang break;
63907e43b41SHong Zhang case 4:
64007e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
641a9dd396cSHong Zhang matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
64207e43b41SHong Zhang break;
64307e43b41SHong Zhang case 3:
64407e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
645a9dd396cSHong Zhang matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
64607e43b41SHong Zhang break;
64707e43b41SHong Zhang case 2: /* 16 slices per block if blocksize=512 */
64807e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2);
649a9dd396cSHong Zhang matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
65007e43b41SHong Zhang break;
65107e43b41SHong Zhang case 1: /* 32 slices per block if blocksize=512 */
65207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize;
6534e58db63SHong Zhang matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
65407e43b41SHong Zhang break;
6558711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
65607e43b41SHong Zhang case 0:
65790d2215bSHong Zhang maxoveravg = a->maxslicewidth / a->avgslicewidth;
65890d2215bSHong Zhang if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
65990d2215bSHong Zhang /* each block handles approximately one slice */
66090d2215bSHong Zhang PetscInt blocky = a->chunksize / 32;
66190d2215bSHong Zhang nchunks = cudastruct->totalchunks;
66290d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
66390d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock;
66490d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map;
66590d2215bSHong Zhang if (blocky == 2) {
66690d2215bSHong Zhang matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
66790d2215bSHong Zhang } else if (blocky == 4) {
66890d2215bSHong Zhang matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
66990d2215bSHong Zhang } else if (blocky == 8) {
67090d2215bSHong Zhang matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67190d2215bSHong Zhang } else if (blocky == 16) {
67290d2215bSHong Zhang matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67390d2215bSHong Zhang } else if (blocky == 32) {
67490d2215bSHong Zhang matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67590d2215bSHong Zhang } else {
67690d2215bSHong Zhang matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
67790d2215bSHong Zhang }
6782d1451d4SHong Zhang } else {
679cca9ff8bSHong Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth;
68090d2215bSHong Zhang if (avgslicesize <= 432) {
68190d2215bSHong Zhang if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
682cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
683cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
68490d2215bSHong Zhang } else {
685cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight;
686cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
68790d2215bSHong Zhang }
688cca9ff8bSHong Zhang } else if (avgslicesize <= 2400) {
689cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight;
690cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
691cca9ff8bSHong Zhang } else {
6924e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight;
6934e58db63SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
6942d1451d4SHong Zhang }
6952d1451d4SHong Zhang }
696cca9ff8bSHong Zhang break;
6978711c661SHong Zhang #endif
6988711c661SHong Zhang default:
6998711c661SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
700cca9ff8bSHong Zhang }
7012d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd());
7022d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x));
7032d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(yy, &y));
7042d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
7052d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
7062d1451d4SHong Zhang }
7072d1451d4SHong Zhang
MatMultAdd_SeqSELLCUDA(Mat A,Vec xx,Vec yy,Vec zz)70866976f2fSJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
7092d1451d4SHong Zhang {
7102d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
7112d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
7122d1451d4SHong Zhang PetscScalar *z;
7132d1451d4SHong Zhang const PetscScalar *y, *x;
714a9dd396cSHong Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight;
7152d1451d4SHong Zhang MatScalar *aval = cudastruct->val;
7162d1451d4SHong Zhang PetscInt *acolidx = cudastruct->colidx;
7172d1451d4SHong Zhang PetscInt *sliidx = cudastruct->sliidx;
71816a9b8deSJose E. Roman #if !defined(PETSC_USE_COMPLEX)
71990d2215bSHong Zhang PetscReal maxoveravg;
72016a9b8deSJose E. Roman PetscInt chunksperblock, nchunks, *chunk_slice_map;
72116a9b8deSJose E. Roman PetscInt blocky = cudastruct->blocky;
72216a9b8deSJose E. Roman #endif
7232d1451d4SHong Zhang
7242d1451d4SHong Zhang PetscFunctionBegin;
72590d2215bSHong Zhang PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
72690d2215bSHong Zhang PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
7272d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A));
7282d1451d4SHong Zhang if (a->nz) {
72916a9b8deSJose E. Roman PetscInt nblocks, blocksize = 512;
73007e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
7312d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x));
7322d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(yy, &y));
7332d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(zz, &z));
7342d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin());
73507e43b41SHong Zhang
73607e43b41SHong Zhang switch (cudastruct->kernelchoice) {
7378711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
738cca9ff8bSHong Zhang case 9:
739cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight;
74090d2215bSHong Zhang if (blocky == 2) {
741cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
74290d2215bSHong Zhang } else if (blocky == 4) {
743cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
74490d2215bSHong Zhang } else if (blocky == 8) {
745cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
74690d2215bSHong Zhang } else if (blocky == 16) {
747cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
74890d2215bSHong Zhang } else if (blocky == 32) {
749cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750cca9ff8bSHong Zhang } else {
751cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
752cca9ff8bSHong Zhang }
753cca9ff8bSHong Zhang break;
75490d2215bSHong Zhang case 8:
75590d2215bSHong Zhang /* each block handles approximately one slice */
75690d2215bSHong Zhang nchunks = cudastruct->totalchunks;
75790d2215bSHong Zhang blocky = a->chunksize / 32;
75890d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
75990d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock;
76090d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map;
76190d2215bSHong Zhang if (blocky == 2) {
76290d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
76390d2215bSHong Zhang } else if (blocky == 4) {
76490d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
76590d2215bSHong Zhang } else if (blocky == 8) {
76690d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
76790d2215bSHong Zhang } else if (blocky == 16) {
76890d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
76990d2215bSHong Zhang } else if (blocky == 32) {
77090d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
77190d2215bSHong Zhang } else {
77290d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
77390d2215bSHong Zhang }
77490d2215bSHong Zhang break;
775cca9ff8bSHong Zhang case 7:
776cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight);
77790d2215bSHong Zhang if (blocky == 2) {
778cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
77990d2215bSHong Zhang } else if (blocky == 4) {
780cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
78190d2215bSHong Zhang } else if (blocky == 8) {
782cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
78390d2215bSHong Zhang } else if (blocky == 16) {
784cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
78590d2215bSHong Zhang } else if (blocky == 32) {
786cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
787cca9ff8bSHong Zhang } else {
788cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
789cca9ff8bSHong Zhang }
790cca9ff8bSHong Zhang break;
7918711c661SHong Zhang #endif
79207e43b41SHong Zhang case 6:
79307e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32);
794a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
79507e43b41SHong Zhang break;
79607e43b41SHong Zhang case 5:
79707e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16);
798a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
79907e43b41SHong Zhang break;
80007e43b41SHong Zhang case 4:
80107e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8);
802a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
80307e43b41SHong Zhang break;
80407e43b41SHong Zhang case 3:
80507e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4);
806a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
80707e43b41SHong Zhang break;
80807e43b41SHong Zhang case 2:
80907e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2);
810a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
81107e43b41SHong Zhang break;
81207e43b41SHong Zhang case 1:
81307e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize;
8144e58db63SHong Zhang matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
81507e43b41SHong Zhang break;
8168711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX)
817cca9ff8bSHong Zhang case 0:
81890d2215bSHong Zhang maxoveravg = a->maxslicewidth / a->avgslicewidth;
81990d2215bSHong Zhang if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
82090d2215bSHong Zhang /* each block handles approximately one slice */
82190d2215bSHong Zhang nchunks = cudastruct->totalchunks;
82290d2215bSHong Zhang blocky = a->chunksize / 32;
82390d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
82490d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock;
82590d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map;
82690d2215bSHong Zhang if (blocky == 2) {
82790d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
82890d2215bSHong Zhang } else if (blocky == 4) {
82990d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83090d2215bSHong Zhang } else if (blocky == 8) {
83190d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83290d2215bSHong Zhang } else if (blocky == 16) {
83390d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83490d2215bSHong Zhang } else if (blocky == 32) {
83590d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83690d2215bSHong Zhang } else {
83790d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
83890d2215bSHong Zhang }
839cca9ff8bSHong Zhang } else {
840cca9ff8bSHong Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth;
84190d2215bSHong Zhang if (avgslicesize <= 432) {
84290d2215bSHong Zhang if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
843cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
844cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
84590d2215bSHong Zhang } else {
846cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight;
847cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
84890d2215bSHong Zhang }
849cca9ff8bSHong Zhang } else if (avgslicesize <= 2400) {
850cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight;
851cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
852cca9ff8bSHong Zhang } else {
853cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight;
854cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
855cca9ff8bSHong Zhang }
856cca9ff8bSHong Zhang }
85707e43b41SHong Zhang break;
8588711c661SHong Zhang #endif
8598711c661SHong Zhang default:
8608711c661SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
8612d1451d4SHong Zhang }
8622d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd());
8632d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x));
8642d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(yy, &y));
8652d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(zz, &z));
8662d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz));
8672d1451d4SHong Zhang } else {
8682d1451d4SHong Zhang PetscCall(VecCopy(yy, zz));
8692d1451d4SHong Zhang }
8702d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
8712d1451d4SHong Zhang }
8722d1451d4SHong Zhang
MatSetFromOptions_SeqSELLCUDA(Mat A,PetscOptionItems PetscOptionsObject)873ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems PetscOptionsObject)
8742d1451d4SHong Zhang {
87507e43b41SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
8764e58db63SHong Zhang PetscInt kernel, blocky;
87707e43b41SHong Zhang PetscBool flg;
87807e43b41SHong Zhang
8792d1451d4SHong Zhang PetscFunctionBegin;
8802d1451d4SHong Zhang PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
88190d2215bSHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
88290d2215bSHong Zhang if (flg) {
88390d2215bSHong Zhang PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
88490d2215bSHong Zhang cudastruct->blocky = blocky;
88590d2215bSHong Zhang }
88607e43b41SHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
88707e43b41SHong Zhang if (flg) {
88807e43b41SHong Zhang PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
88907e43b41SHong Zhang cudastruct->kernelchoice = kernel;
8903a7d0413SPierre Jolivet if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg));
8914e58db63SHong Zhang }
8922d1451d4SHong Zhang PetscOptionsHeadEnd();
8932d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
8942d1451d4SHong Zhang }
8952d1451d4SHong Zhang
MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)89607e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
89707e43b41SHong Zhang {
89807e43b41SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
89907e43b41SHong Zhang
90090d2215bSHong Zhang PetscFunctionBegin;
90107e43b41SHong Zhang PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
90207e43b41SHong Zhang PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
90307e43b41SHong Zhang PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
90407e43b41SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
90507e43b41SHong Zhang }
90607e43b41SHong Zhang
MatAssemblyEnd_SeqSELLCUDA(Mat A,MatAssemblyType mode)9072d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
9082d1451d4SHong Zhang {
9092d1451d4SHong Zhang PetscFunctionBegin;
9102d1451d4SHong Zhang PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
91107e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
9122d1451d4SHong Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
9133a7d0413SPierre Jolivet if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLCUDACopyToGPU(A));
9142d1451d4SHong Zhang A->ops->mult = MatMult_SeqSELLCUDA;
9152d1451d4SHong Zhang A->ops->multadd = MatMultAdd_SeqSELLCUDA;
9162d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
9172d1451d4SHong Zhang }
9182d1451d4SHong Zhang
MatZeroEntries_SeqSELLCUDA(Mat A)9198df136f9SHong Zhang static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
9208df136f9SHong Zhang {
9218df136f9SHong Zhang PetscBool both = PETSC_FALSE;
9228df136f9SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
9238df136f9SHong Zhang
9248df136f9SHong Zhang PetscFunctionBegin;
9258df136f9SHong Zhang if (A->factortype == MAT_FACTOR_NONE) {
9268df136f9SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
9278df136f9SHong Zhang if (cudastruct->val) {
9288df136f9SHong Zhang both = PETSC_TRUE;
929f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
9308df136f9SHong Zhang }
9318df136f9SHong Zhang }
9328df136f9SHong Zhang PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
9338df136f9SHong Zhang if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
9348df136f9SHong Zhang else A->offloadmask = PETSC_OFFLOAD_CPU;
9358df136f9SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
9368df136f9SHong Zhang }
9378df136f9SHong Zhang
MatDestroy_SeqSELLCUDA(Mat A)9382d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
9392d1451d4SHong Zhang {
9402d1451d4SHong Zhang PetscFunctionBegin;
9418df136f9SHong Zhang if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
9422d1451d4SHong Zhang PetscCall(MatDestroy_SeqSELL(A));
9432d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
9442d1451d4SHong Zhang }
9452d1451d4SHong Zhang
9462d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
MatDuplicate_SeqSELLCUDA(Mat A,MatDuplicateOption cpvalues,Mat * B)9472d1451d4SHong Zhang static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
9482d1451d4SHong Zhang {
9492d1451d4SHong Zhang PetscFunctionBegin;
9502d1451d4SHong Zhang PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
9512d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
9522d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
9532d1451d4SHong Zhang }
9542d1451d4SHong Zhang
MatConvert_SeqSELL_SeqSELLCUDA(Mat B)9558df136f9SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
9562d1451d4SHong Zhang {
9572d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct;
9582d1451d4SHong Zhang
9592d1451d4SHong Zhang PetscFunctionBegin;
9602d1451d4SHong Zhang PetscCall(PetscFree(B->defaultvectype));
9612d1451d4SHong Zhang PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
9622d1451d4SHong Zhang
9632d1451d4SHong Zhang if (!B->spptr) {
9642d1451d4SHong Zhang if (B->factortype == MAT_FACTOR_NONE) {
9652d1451d4SHong Zhang PetscCall(PetscNew(&cudastruct));
9662d1451d4SHong Zhang B->spptr = cudastruct;
9672d1451d4SHong Zhang }
9682d1451d4SHong Zhang }
9692d1451d4SHong Zhang
9702d1451d4SHong Zhang B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA;
9712d1451d4SHong Zhang B->ops->destroy = MatDestroy_SeqSELLCUDA;
9722d1451d4SHong Zhang B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
9732d1451d4SHong Zhang B->ops->mult = MatMult_SeqSELLCUDA;
9742d1451d4SHong Zhang B->ops->multadd = MatMultAdd_SeqSELLCUDA;
9752d1451d4SHong Zhang B->ops->duplicate = MatDuplicate_SeqSELLCUDA;
9768df136f9SHong Zhang B->ops->zeroentries = MatZeroEntries_SeqSELLCUDA;
9772d1451d4SHong Zhang
97807e43b41SHong Zhang /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
97907e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
98007e43b41SHong Zhang
9812d1451d4SHong Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
9822d1451d4SHong Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
9832d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
9842d1451d4SHong Zhang }
9852d1451d4SHong Zhang
986773bf0f6SHong Zhang /*MC
987773bf0f6SHong Zhang MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
988773bf0f6SHong Zhang
989773bf0f6SHong Zhang Options Database Keys:
990773bf0f6SHong Zhang + -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
991773bf0f6SHong Zhang . -mat_sell_spmv_cuda_kernel - selects a spmv kernel for MatSELLCUDA
992773bf0f6SHong Zhang - -mat_sell_spmv_cuda_blocky - sets the y dimension of the block size of the spmv kernels. These kernels use a 2D block with the x dimension being 32
993773bf0f6SHong Zhang
994773bf0f6SHong Zhang Level: beginner
995773bf0f6SHong Zhang
996773bf0f6SHong Zhang .seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
997773bf0f6SHong Zhang M*/
998773bf0f6SHong Zhang
MatCreate_SeqSELLCUDA(Mat B)9992d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
10002d1451d4SHong Zhang {
10012d1451d4SHong Zhang PetscFunctionBegin;
10022d1451d4SHong Zhang PetscCall(MatCreate_SeqSELL(B));
10032d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
10048df136f9SHong Zhang PetscCall(MatSetFromOptions(B));
10052d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS);
10062d1451d4SHong Zhang }
1007