12d1451d4SHong Zhang #include <cuda_runtime.h> 22d1451d4SHong Zhang 32d1451d4SHong Zhang #include <petscdevice_cuda.h> 42d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 52d1451d4SHong Zhang 607e43b41SHong Zhang #define SLICE_HEIGHT 16 707e43b41SHong Zhang 82d1451d4SHong Zhang typedef struct { 92d1451d4SHong Zhang PetscInt *colidx; /* column index */ 102d1451d4SHong Zhang MatScalar *val; 112d1451d4SHong Zhang PetscInt *sliidx; 122d1451d4SHong Zhang PetscInt nonzerostate; 1307e43b41SHong Zhang PetscInt kernelchoice; 144e58db63SHong Zhang PetscInt blocky; 152d1451d4SHong Zhang } Mat_SeqSELLCUDA; 162d1451d4SHong Zhang 172d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 182d1451d4SHong Zhang { 192d1451d4SHong Zhang PetscFunctionBegin; 202d1451d4SHong Zhang if (*cudastruct) { 212d1451d4SHong Zhang if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 222d1451d4SHong Zhang if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 232d1451d4SHong Zhang if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 242d1451d4SHong Zhang PetscCall(PetscFree(*cudastruct)); 252d1451d4SHong Zhang } 262d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 272d1451d4SHong Zhang } 282d1451d4SHong Zhang 292d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 302d1451d4SHong Zhang { 312d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 322d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 332d1451d4SHong Zhang 342d1451d4SHong Zhang PetscFunctionBegin; 352d1451d4SHong Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 362d1451d4SHong Zhang PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 372d1451d4SHong Zhang if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 382d1451d4SHong Zhang /* copy values only */ 392d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 402d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 412d1451d4SHong Zhang } else { 422d1451d4SHong Zhang if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 432d1451d4SHong Zhang if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 442d1451d4SHong Zhang if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 452d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 462d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 472d1451d4SHong Zhang /* copy values, nz or maxallocmat? */ 482d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 492d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 5007e43b41SHong Zhang 5107e43b41SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 5207e43b41SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 532d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt))); 542d1451d4SHong Zhang cudastruct->nonzerostate = A->nonzerostate; 552d1451d4SHong Zhang } 562d1451d4SHong Zhang PetscCallCUDA(WaitForCUDA()); 572d1451d4SHong Zhang PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 582d1451d4SHong Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 592d1451d4SHong Zhang } 602d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 612d1451d4SHong Zhang } 622d1451d4SHong Zhang 634e58db63SHong Zhang __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 642d1451d4SHong Zhang { 652d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 662d1451d4SHong Zhang MatScalar sum; 672d1451d4SHong Zhang /* one thread per row. */ 682d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 692d1451d4SHong Zhang if (row < nrows) { 704e58db63SHong Zhang slice_id = row / sliceheight; 714e58db63SHong Zhang row_in_slice = row % sliceheight; 722d1451d4SHong Zhang sum = 0.0; 734e58db63SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 742d1451d4SHong Zhang y[row] = sum; 752d1451d4SHong Zhang } 762d1451d4SHong Zhang } 772d1451d4SHong Zhang 784e58db63SHong Zhang __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 792d1451d4SHong Zhang { 802d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 812d1451d4SHong Zhang MatScalar sum; 822d1451d4SHong Zhang /* one thread per row. */ 832d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 842d1451d4SHong Zhang if (row < nrows) { 854e58db63SHong Zhang slice_id = row / sliceheight; 864e58db63SHong Zhang row_in_slice = row % sliceheight; 872d1451d4SHong Zhang sum = 0.0; 884e58db63SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 892d1451d4SHong Zhang z[row] = y[row] + sum; 902d1451d4SHong Zhang } 912d1451d4SHong Zhang } 9207e43b41SHong Zhang 9307e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */ 9407e43b41SHong Zhang template <int BLOCKY> 954e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 9607e43b41SHong Zhang { 974e58db63SHong Zhang __shared__ MatScalar shared[32][BLOCKY]; 984e58db63SHong Zhang PetscInt i, row, slice_id = blockIdx.x; 994e58db63SHong Zhang int tid = threadIdx.x + threadIdx.y * 32; 10007e43b41SHong Zhang /* transposed index */ 10107e43b41SHong Zhang int tidx = tid % BLOCKY; 10207e43b41SHong Zhang int tidy = tid / BLOCKY; 10307e43b41SHong Zhang PetscScalar t = 0.0; 1044e58db63SHong Zhang 1054e58db63SHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 10607e43b41SHong Zhang if (row < nrows) { 1074e58db63SHong Zhang for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 1082d1451d4SHong Zhang } 1094e58db63SHong Zhang #pragma unroll 1104e58db63SHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 11107e43b41SHong Zhang /* transpose layout to reduce each row using warp shfl */ 11207e43b41SHong Zhang shared[threadIdx.x][threadIdx.y] = t; 11307e43b41SHong Zhang __syncthreads(); 11407e43b41SHong Zhang t = shared[tidy][tidx]; 11507e43b41SHong Zhang #pragma unroll 11607e43b41SHong Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 1174e58db63SHong Zhang if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 11807e43b41SHong Zhang __syncthreads(); 1194e58db63SHong Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; } 1202d1451d4SHong Zhang } 1212d1451d4SHong Zhang 122*cca9ff8bSHong Zhang /* use 1 block per slice, suitable for large slice width */ 123*cca9ff8bSHong Zhang template <int BLOCKY> 124*cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 125*cca9ff8bSHong Zhang { 126*cca9ff8bSHong Zhang __shared__ MatScalar shared[32][BLOCKY]; 127*cca9ff8bSHong Zhang PetscInt i, row, slice_id = blockIdx.x; 128*cca9ff8bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32; 129*cca9ff8bSHong Zhang /* transposed index */ 130*cca9ff8bSHong Zhang int tidx = tid % BLOCKY; 131*cca9ff8bSHong Zhang int tidy = tid / BLOCKY; 132*cca9ff8bSHong Zhang PetscScalar t = 0.0; 133*cca9ff8bSHong Zhang 134*cca9ff8bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 135*cca9ff8bSHong Zhang if (row < nrows) { 136*cca9ff8bSHong Zhang for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 137*cca9ff8bSHong Zhang } 138*cca9ff8bSHong Zhang #pragma unroll 139*cca9ff8bSHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 140*cca9ff8bSHong Zhang /* transpose layout to reduce each row using warp shfl */ 141*cca9ff8bSHong Zhang shared[threadIdx.x][threadIdx.y] = t; 142*cca9ff8bSHong Zhang __syncthreads(); 143*cca9ff8bSHong Zhang t = shared[tidy][tidx]; 144*cca9ff8bSHong Zhang #pragma unroll 145*cca9ff8bSHong Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 146*cca9ff8bSHong Zhang if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 147*cca9ff8bSHong Zhang __syncthreads(); 148*cca9ff8bSHong Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; } 149*cca9ff8bSHong Zhang } 150*cca9ff8bSHong Zhang 15107e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */ 1524e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 1532d1451d4SHong Zhang { 15407e43b41SHong Zhang PetscInt i, row, slice_id; 15507e43b41SHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 1564e58db63SHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 15707e43b41SHong Zhang double t = 0.0; 15807e43b41SHong Zhang if (row < nrows) { 15907e43b41SHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 16007e43b41SHong Zhang } 1614e58db63SHong Zhang #pragma unroll 1624e58db63SHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 1634e58db63SHong Zhang if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; } 16407e43b41SHong Zhang } 16507e43b41SHong Zhang 166*cca9ff8bSHong Zhang /* use 1 warp per slice, suitable for small slice width */ 167*cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 168*cca9ff8bSHong Zhang { 169*cca9ff8bSHong Zhang PetscInt i, row, slice_id; 170*cca9ff8bSHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 171*cca9ff8bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 172*cca9ff8bSHong Zhang double t = 0.0; 173*cca9ff8bSHong Zhang if (row < nrows) { 174*cca9ff8bSHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 175*cca9ff8bSHong Zhang } 176*cca9ff8bSHong Zhang #pragma unroll 177*cca9ff8bSHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 178*cca9ff8bSHong Zhang if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; } 179*cca9ff8bSHong Zhang } 180*cca9ff8bSHong Zhang 181a9dd396cSHong Zhang /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/ 182a9dd396cSHong Zhang 183a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 18407e43b41SHong Zhang { 18507e43b41SHong Zhang __shared__ MatScalar shared[512]; 1862d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 18707e43b41SHong Zhang /* multiple threads per row. */ 1882d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 1892d1451d4SHong Zhang if (row < nrows) { 1902d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 1912d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 1922d1451d4SHong Zhang 1932d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 1942d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 19507e43b41SHong Zhang __syncthreads(); 19607e43b41SHong Zhang if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 19707e43b41SHong Zhang __syncthreads(); 19807e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 1992d1451d4SHong Zhang __syncthreads(); 2002d1451d4SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 2012d1451d4SHong Zhang __syncthreads(); 2022d1451d4SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 2032d1451d4SHong Zhang __syncthreads(); 2042d1451d4SHong Zhang if (threadIdx.y < 1) { 20507e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 2062d1451d4SHong Zhang y[row] = shared[threadIdx.x]; 2072d1451d4SHong Zhang } 2082d1451d4SHong Zhang } 2092d1451d4SHong Zhang } 2102d1451d4SHong Zhang 211a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 2122d1451d4SHong Zhang { 21307e43b41SHong Zhang __shared__ MatScalar shared[512]; 2142d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 21507e43b41SHong Zhang /* multiple threads per row. */ 2162d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 2172d1451d4SHong Zhang if (row < nrows) { 2182d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 2192d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 2202d1451d4SHong Zhang 2212d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 2222d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 22307e43b41SHong Zhang __syncthreads(); 22407e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 2252d1451d4SHong Zhang __syncthreads(); 2262d1451d4SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 2272d1451d4SHong Zhang __syncthreads(); 2282d1451d4SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 2292d1451d4SHong Zhang __syncthreads(); 2302d1451d4SHong Zhang if (threadIdx.y < 1) { 23107e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 23207e43b41SHong Zhang y[row] = shared[threadIdx.x]; 23307e43b41SHong Zhang } 23407e43b41SHong Zhang } 23507e43b41SHong Zhang } 23607e43b41SHong Zhang 237a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 23807e43b41SHong Zhang { 23907e43b41SHong Zhang __shared__ MatScalar shared[512]; 24007e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 24107e43b41SHong Zhang /* multiple threads per row. */ 24207e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 24307e43b41SHong Zhang if (row < nrows) { 24407e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 24507e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 24607e43b41SHong Zhang 24707e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 24807e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 24907e43b41SHong Zhang __syncthreads(); 25007e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 25107e43b41SHong Zhang __syncthreads(); 25207e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 25307e43b41SHong Zhang __syncthreads(); 25407e43b41SHong Zhang if (threadIdx.y < 1) { 25507e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 25607e43b41SHong Zhang y[row] = shared[threadIdx.x]; 25707e43b41SHong Zhang } 25807e43b41SHong Zhang } 25907e43b41SHong Zhang } 26007e43b41SHong Zhang 261a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 26207e43b41SHong Zhang { 26307e43b41SHong Zhang __shared__ MatScalar shared[512]; 26407e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 26507e43b41SHong Zhang /* multiple threads per row. */ 26607e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 26707e43b41SHong Zhang if (row < nrows) { 26807e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 26907e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 27007e43b41SHong Zhang 27107e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 27207e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 27307e43b41SHong Zhang __syncthreads(); 27407e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 27507e43b41SHong Zhang __syncthreads(); 27607e43b41SHong Zhang if (threadIdx.y < 1) { 27707e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 27807e43b41SHong Zhang y[row] = shared[threadIdx.x]; 27907e43b41SHong Zhang } 28007e43b41SHong Zhang } 28107e43b41SHong Zhang } 28207e43b41SHong Zhang 283a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 28407e43b41SHong Zhang { 28507e43b41SHong Zhang __shared__ MatScalar shared[512]; 28607e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 28707e43b41SHong Zhang /* multiple threads per row. */ 28807e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 28907e43b41SHong Zhang if (row < nrows) { 29007e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 29107e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 29207e43b41SHong Zhang 29307e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 29407e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 29507e43b41SHong Zhang __syncthreads(); 29607e43b41SHong Zhang if (threadIdx.y < 1) { 29707e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 29807e43b41SHong Zhang y[row] = shared[threadIdx.x]; 29907e43b41SHong Zhang } 30007e43b41SHong Zhang } 30107e43b41SHong Zhang } 30207e43b41SHong Zhang 303a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 30407e43b41SHong Zhang { 30507e43b41SHong Zhang __shared__ MatScalar shared[512]; 30607e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 30707e43b41SHong Zhang /* multiple threads per row. */ 30807e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 30907e43b41SHong Zhang if (row < nrows) { 31007e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 31107e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 31207e43b41SHong Zhang 31307e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 31407e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 31507e43b41SHong Zhang __syncthreads(); 31607e43b41SHong Zhang if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 31707e43b41SHong Zhang __syncthreads(); 31807e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 31907e43b41SHong Zhang __syncthreads(); 32007e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 32107e43b41SHong Zhang __syncthreads(); 32207e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 32307e43b41SHong Zhang __syncthreads(); 32407e43b41SHong Zhang if (threadIdx.y < 1) { 32507e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 3262d1451d4SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 3272d1451d4SHong Zhang } 3282d1451d4SHong Zhang } 3292d1451d4SHong Zhang } 33007e43b41SHong Zhang 331a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 33207e43b41SHong Zhang { 33307e43b41SHong Zhang __shared__ MatScalar shared[512]; 33407e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 33507e43b41SHong Zhang /* multiple threads per row. */ 33607e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 33707e43b41SHong Zhang if (row < nrows) { 33807e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 33907e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 34007e43b41SHong Zhang 34107e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 34207e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 34307e43b41SHong Zhang __syncthreads(); 34407e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 34507e43b41SHong Zhang __syncthreads(); 34607e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 34707e43b41SHong Zhang __syncthreads(); 34807e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 34907e43b41SHong Zhang __syncthreads(); 35007e43b41SHong Zhang if (threadIdx.y < 1) { 35107e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 35207e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 35307e43b41SHong Zhang } 35407e43b41SHong Zhang } 35507e43b41SHong Zhang } 35607e43b41SHong Zhang 357a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 35807e43b41SHong Zhang { 35907e43b41SHong Zhang __shared__ MatScalar shared[512]; 36007e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 36107e43b41SHong Zhang /* multiple threads per row. */ 36207e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 36307e43b41SHong Zhang if (row < nrows) { 36407e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 36507e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 36607e43b41SHong Zhang 36707e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 36807e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 36907e43b41SHong Zhang __syncthreads(); 37007e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 37107e43b41SHong Zhang __syncthreads(); 37207e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 37307e43b41SHong Zhang __syncthreads(); 37407e43b41SHong Zhang if (threadIdx.y < 1) { 37507e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 37607e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 37707e43b41SHong Zhang } 37807e43b41SHong Zhang } 37907e43b41SHong Zhang } 38007e43b41SHong Zhang 381a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 38207e43b41SHong Zhang { 38307e43b41SHong Zhang __shared__ MatScalar shared[512]; 38407e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 38507e43b41SHong Zhang /* multiple threads per row. */ 38607e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 38707e43b41SHong Zhang if (row < nrows) { 38807e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 38907e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 39007e43b41SHong Zhang 39107e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 39207e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 39307e43b41SHong Zhang __syncthreads(); 39407e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 39507e43b41SHong Zhang __syncthreads(); 39607e43b41SHong Zhang if (threadIdx.y < 1) { 39707e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 39807e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 39907e43b41SHong Zhang } 40007e43b41SHong Zhang } 40107e43b41SHong Zhang } 40207e43b41SHong Zhang 403a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 40407e43b41SHong Zhang { 40507e43b41SHong Zhang __shared__ MatScalar shared[512]; 40607e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 40707e43b41SHong Zhang /* multiple threads per row. */ 40807e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 40907e43b41SHong Zhang if (row < nrows) { 41007e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 41107e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 41207e43b41SHong Zhang 41307e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 41407e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 41507e43b41SHong Zhang __syncthreads(); 41607e43b41SHong Zhang if (threadIdx.y < 1) { 41707e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 41807e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 41907e43b41SHong Zhang } 42007e43b41SHong Zhang } 4212d1451d4SHong Zhang } 4222d1451d4SHong Zhang 4232d1451d4SHong Zhang PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 4242d1451d4SHong Zhang { 4252d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 4262d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 4272d1451d4SHong Zhang PetscScalar *y; 4282d1451d4SHong Zhang const PetscScalar *x; 429a9dd396cSHong Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 4302d1451d4SHong Zhang MatScalar *aval; 4312d1451d4SHong Zhang PetscInt *acolidx; 4322d1451d4SHong Zhang PetscInt *sliidx; 43307e43b41SHong Zhang PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 43407e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 4352d1451d4SHong Zhang 4362d1451d4SHong Zhang PetscFunctionBegin; 4374e58db63SHong Zhang PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 4382d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 4392d1451d4SHong Zhang /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 4402d1451d4SHong Zhang aval = cudastruct->val; 4412d1451d4SHong Zhang acolidx = cudastruct->colidx; 4422d1451d4SHong Zhang sliidx = cudastruct->sliidx; 4432d1451d4SHong Zhang 4442d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 4452d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(yy, &y)); 4462d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 44707e43b41SHong Zhang 44807e43b41SHong Zhang switch (cudastruct->kernelchoice) { 44907e43b41SHong Zhang case 9: 4504e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 4514e58db63SHong Zhang if (cudastruct->blocky == 2) { 4524e58db63SHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4534e58db63SHong Zhang } else if (cudastruct->blocky == 4) { 4544e58db63SHong Zhang matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4554e58db63SHong Zhang } else if (cudastruct->blocky == 8) { 4564e58db63SHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4574e58db63SHong Zhang } else if (cudastruct->blocky == 16) { 4584e58db63SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4594e58db63SHong Zhang } else if (cudastruct->blocky == 32) { 4604e58db63SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 461*cca9ff8bSHong Zhang } else { 462*cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 46307e43b41SHong Zhang } 46407e43b41SHong Zhang break; 46507e43b41SHong Zhang case 7: 4664e58db63SHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); 4674e58db63SHong Zhang if (cudastruct->blocky == 2) { 4684e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4694e58db63SHong Zhang } else if (cudastruct->blocky == 4) { 4704e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4714e58db63SHong Zhang } else if (cudastruct->blocky == 8) { 4724e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4734e58db63SHong Zhang } else if (cudastruct->blocky == 16) { 4744e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4754e58db63SHong Zhang } else if (cudastruct->blocky == 32) { 4764e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4774e58db63SHong Zhang } else { 4784e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 4794e58db63SHong Zhang } 48007e43b41SHong Zhang break; 48107e43b41SHong Zhang case 6: 48207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 483a9dd396cSHong Zhang matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 48407e43b41SHong Zhang break; 48507e43b41SHong Zhang case 5: 48607e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 487a9dd396cSHong Zhang matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 48807e43b41SHong Zhang break; 48907e43b41SHong Zhang case 4: 49007e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 491a9dd396cSHong Zhang matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 49207e43b41SHong Zhang break; 49307e43b41SHong Zhang case 3: 49407e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 495a9dd396cSHong Zhang matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 49607e43b41SHong Zhang break; 49707e43b41SHong Zhang case 2: /* 16 slices per block if blocksize=512 */ 49807e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 499a9dd396cSHong Zhang matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 50007e43b41SHong Zhang break; 50107e43b41SHong Zhang case 1: /* 32 slices per block if blocksize=512 */ 50207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize; 5034e58db63SHong Zhang matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 50407e43b41SHong Zhang break; 50507e43b41SHong Zhang case 0: 5064e58db63SHong Zhang if (sliceheight * a->maxslicewidth > 20800) { /* important threshold */ 5074e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 5084e58db63SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 5092d1451d4SHong Zhang } else { 510*cca9ff8bSHong Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth; 511*cca9ff8bSHong Zhang if (avgslicesize <= 96) { 512*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 513*cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 514*cca9ff8bSHong Zhang } else if (avgslicesize <= 432) { 515*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 516*cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 517*cca9ff8bSHong Zhang } else if (avgslicesize <= 2400) { 518*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 519*cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 520*cca9ff8bSHong Zhang } else { 5214e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 5224e58db63SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 5232d1451d4SHong Zhang } 5242d1451d4SHong Zhang } 525*cca9ff8bSHong Zhang break; 526*cca9ff8bSHong Zhang } 5272d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 5282d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 5292d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 5302d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 5312d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 5322d1451d4SHong Zhang } 5332d1451d4SHong Zhang 5342d1451d4SHong Zhang PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 5352d1451d4SHong Zhang { 5362d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 5372d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 5382d1451d4SHong Zhang PetscScalar *z; 5392d1451d4SHong Zhang const PetscScalar *y, *x; 540a9dd396cSHong Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 5412d1451d4SHong Zhang MatScalar *aval = cudastruct->val; 5422d1451d4SHong Zhang PetscInt *acolidx = cudastruct->colidx; 5432d1451d4SHong Zhang PetscInt *sliidx = cudastruct->sliidx; 5442d1451d4SHong Zhang 5452d1451d4SHong Zhang PetscFunctionBegin; 5464e58db63SHong Zhang PetscCheck(sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 5472d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 5482d1451d4SHong Zhang if (a->nz) { 54907e43b41SHong Zhang PetscInt nblocks, blocksize = 512; 55007e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 5512d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 5522d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(yy, &y)); 5532d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(zz, &z)); 5542d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 55507e43b41SHong Zhang 55607e43b41SHong Zhang switch (cudastruct->kernelchoice) { 557*cca9ff8bSHong Zhang case 9: 558*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 559*cca9ff8bSHong Zhang if (cudastruct->blocky == 2) { 560*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 561*cca9ff8bSHong Zhang } else if (cudastruct->blocky == 4) { 562*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 563*cca9ff8bSHong Zhang } else if (cudastruct->blocky == 8) { 564*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 565*cca9ff8bSHong Zhang } else if (cudastruct->blocky == 16) { 566*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 567*cca9ff8bSHong Zhang } else if (cudastruct->blocky == 32) { 568*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 569*cca9ff8bSHong Zhang } else { 570*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 571*cca9ff8bSHong Zhang } 572*cca9ff8bSHong Zhang break; 573*cca9ff8bSHong Zhang case 7: 574*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); 575*cca9ff8bSHong Zhang if (cudastruct->blocky == 2) { 576*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 577*cca9ff8bSHong Zhang } else if (cudastruct->blocky == 4) { 578*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 579*cca9ff8bSHong Zhang } else if (cudastruct->blocky == 8) { 580*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 581*cca9ff8bSHong Zhang } else if (cudastruct->blocky == 16) { 582*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 583*cca9ff8bSHong Zhang } else if (cudastruct->blocky == 32) { 584*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 585*cca9ff8bSHong Zhang } else { 586*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 587*cca9ff8bSHong Zhang } 588*cca9ff8bSHong Zhang break; 58907e43b41SHong Zhang case 6: 59007e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); 591a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); 59207e43b41SHong Zhang break; 59307e43b41SHong Zhang case 5: 59407e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); 595a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); 59607e43b41SHong Zhang break; 59707e43b41SHong Zhang case 4: 59807e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); 599a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); 60007e43b41SHong Zhang break; 60107e43b41SHong Zhang case 3: 60207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); 603a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); 60407e43b41SHong Zhang break; 60507e43b41SHong Zhang case 2: 60607e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 607a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); 60807e43b41SHong Zhang break; 60907e43b41SHong Zhang case 1: 61007e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize; 6114e58db63SHong Zhang matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 61207e43b41SHong Zhang break; 613*cca9ff8bSHong Zhang case 0: 614*cca9ff8bSHong Zhang if (sliceheight * a->maxslicewidth > 20800) { 615*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 616*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 617*cca9ff8bSHong Zhang } else { 618*cca9ff8bSHong Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth; 619*cca9ff8bSHong Zhang if (avgslicesize <= 96) { 620*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 621*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 622*cca9ff8bSHong Zhang } else if (avgslicesize <= 432) { 623*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 624*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 625*cca9ff8bSHong Zhang } else if (avgslicesize <= 2400) { 626*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 627*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 628*cca9ff8bSHong Zhang } else { 629*cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 630*cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 631*cca9ff8bSHong Zhang } 632*cca9ff8bSHong Zhang } 63307e43b41SHong Zhang break; 6342d1451d4SHong Zhang } 6352d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 6362d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 6372d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(yy, &y)); 6382d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 6392d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 6402d1451d4SHong Zhang } else { 6412d1451d4SHong Zhang PetscCall(VecCopy(yy, zz)); 6422d1451d4SHong Zhang } 6432d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 6442d1451d4SHong Zhang } 6452d1451d4SHong Zhang 6462d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 6472d1451d4SHong Zhang { 64807e43b41SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 6494e58db63SHong Zhang PetscInt kernel, blocky; 65007e43b41SHong Zhang PetscBool flg; 65107e43b41SHong Zhang 6522d1451d4SHong Zhang PetscFunctionBegin; 6532d1451d4SHong Zhang PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 65407e43b41SHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 65507e43b41SHong Zhang if (flg) { 65607e43b41SHong Zhang PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 65707e43b41SHong Zhang cudastruct->kernelchoice = kernel; 65807e43b41SHong Zhang } 6594e58db63SHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 6604e58db63SHong Zhang if (flg) { 6614e58db63SHong Zhang PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", kernel); 6624e58db63SHong Zhang cudastruct->blocky = blocky; 6634e58db63SHong Zhang } 6642d1451d4SHong Zhang PetscOptionsHeadEnd(); 6652d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 6662d1451d4SHong Zhang } 6672d1451d4SHong Zhang 66807e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 66907e43b41SHong Zhang { 67007e43b41SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 67107e43b41SHong Zhang 67207e43b41SHong Zhang PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 67307e43b41SHong Zhang PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 67407e43b41SHong Zhang PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 67507e43b41SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 67607e43b41SHong Zhang } 67707e43b41SHong Zhang 6782d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 6792d1451d4SHong Zhang { 6802d1451d4SHong Zhang PetscFunctionBegin; 6812d1451d4SHong Zhang PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 68207e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 6832d1451d4SHong Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 6842d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 6852d1451d4SHong Zhang A->ops->mult = MatMult_SeqSELLCUDA; 6862d1451d4SHong Zhang A->ops->multadd = MatMultAdd_SeqSELLCUDA; 6872d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 6882d1451d4SHong Zhang } 6892d1451d4SHong Zhang 6902d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 6912d1451d4SHong Zhang { 6922d1451d4SHong Zhang PetscFunctionBegin; 6932d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { 6942d1451d4SHong Zhang if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 6952d1451d4SHong Zhang } 6962d1451d4SHong Zhang PetscCall(MatDestroy_SeqSELL(A)); 6972d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 6982d1451d4SHong Zhang } 6992d1451d4SHong Zhang 7002d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 7012d1451d4SHong Zhang static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 7022d1451d4SHong Zhang { 7032d1451d4SHong Zhang PetscFunctionBegin; 7042d1451d4SHong Zhang PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 7052d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 7062d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 7072d1451d4SHong Zhang } 7082d1451d4SHong Zhang 7092d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 7102d1451d4SHong Zhang { 7112d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct; 7122d1451d4SHong Zhang 7132d1451d4SHong Zhang PetscFunctionBegin; 7142d1451d4SHong Zhang PetscCall(PetscFree(B->defaultvectype)); 7152d1451d4SHong Zhang PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 7162d1451d4SHong Zhang 7172d1451d4SHong Zhang if (!B->spptr) { 7182d1451d4SHong Zhang if (B->factortype == MAT_FACTOR_NONE) { 7192d1451d4SHong Zhang PetscCall(PetscNew(&cudastruct)); 7202d1451d4SHong Zhang B->spptr = cudastruct; 7212d1451d4SHong Zhang } 7222d1451d4SHong Zhang } 7232d1451d4SHong Zhang 7242d1451d4SHong Zhang B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 7252d1451d4SHong Zhang B->ops->destroy = MatDestroy_SeqSELLCUDA; 7262d1451d4SHong Zhang B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 7272d1451d4SHong Zhang B->ops->mult = MatMult_SeqSELLCUDA; 7282d1451d4SHong Zhang B->ops->multadd = MatMultAdd_SeqSELLCUDA; 7292d1451d4SHong Zhang B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 7302d1451d4SHong Zhang 73107e43b41SHong Zhang /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 73207e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 73307e43b41SHong Zhang 7342d1451d4SHong Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 7352d1451d4SHong Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 7362d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 7372d1451d4SHong Zhang } 7382d1451d4SHong Zhang 7392d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 7402d1451d4SHong Zhang { 7412d1451d4SHong Zhang PetscFunctionBegin; 7422d1451d4SHong Zhang PetscCall(MatCreate_SeqSELL(B)); 7432d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 7442d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 7452d1451d4SHong Zhang } 746