12d1451d4SHong Zhang #include <cuda_runtime.h> 22d1451d4SHong Zhang 32d1451d4SHong Zhang #include <petscdevice_cuda.h> 42d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 52d1451d4SHong Zhang 6*07e43b41SHong Zhang #define SLICE_HEIGHT 16 7*07e43b41SHong Zhang 82d1451d4SHong Zhang typedef struct { 92d1451d4SHong Zhang PetscInt *colidx; /* column index */ 102d1451d4SHong Zhang MatScalar *val; 112d1451d4SHong Zhang PetscInt *sliidx; 122d1451d4SHong Zhang PetscInt nonzerostate; 13*07e43b41SHong Zhang PetscInt kernelchoice; 142d1451d4SHong Zhang } Mat_SeqSELLCUDA; 152d1451d4SHong Zhang 162d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 172d1451d4SHong Zhang { 182d1451d4SHong Zhang PetscFunctionBegin; 192d1451d4SHong Zhang if (*cudastruct) { 202d1451d4SHong Zhang if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 212d1451d4SHong Zhang if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 222d1451d4SHong Zhang if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 232d1451d4SHong Zhang PetscCall(PetscFree(*cudastruct)); 242d1451d4SHong Zhang } 252d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 262d1451d4SHong Zhang } 272d1451d4SHong Zhang 282d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 292d1451d4SHong Zhang { 302d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 312d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 322d1451d4SHong Zhang 332d1451d4SHong Zhang PetscFunctionBegin; 342d1451d4SHong Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 352d1451d4SHong Zhang PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 362d1451d4SHong Zhang if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 372d1451d4SHong Zhang /* copy values only */ 382d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 392d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 402d1451d4SHong Zhang } else { 412d1451d4SHong Zhang if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 422d1451d4SHong Zhang if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 432d1451d4SHong Zhang if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 442d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 452d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 462d1451d4SHong Zhang /* copy values, nz or maxallocmat? */ 472d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 482d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 49*07e43b41SHong Zhang 50*07e43b41SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 51*07e43b41SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 522d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt))); 532d1451d4SHong Zhang cudastruct->nonzerostate = A->nonzerostate; 542d1451d4SHong Zhang } 552d1451d4SHong Zhang PetscCallCUDA(WaitForCUDA()); 562d1451d4SHong Zhang PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 572d1451d4SHong Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 582d1451d4SHong Zhang } 592d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 602d1451d4SHong Zhang } 612d1451d4SHong Zhang 622d1451d4SHong Zhang __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 632d1451d4SHong Zhang { 642d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 652d1451d4SHong Zhang MatScalar sum; 662d1451d4SHong Zhang /* one thread per row. */ 672d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 682d1451d4SHong Zhang if (row < nrows) { 692d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 702d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 712d1451d4SHong Zhang sum = 0.0; 722d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]]; 732d1451d4SHong Zhang y[row] = sum; 742d1451d4SHong Zhang } 752d1451d4SHong Zhang } 762d1451d4SHong Zhang 772d1451d4SHong Zhang __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 782d1451d4SHong Zhang { 792d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 802d1451d4SHong Zhang MatScalar sum; 812d1451d4SHong Zhang /* one thread per row. */ 822d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 832d1451d4SHong Zhang if (row < nrows) { 842d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 852d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 862d1451d4SHong Zhang sum = 0.0; 872d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]]; 882d1451d4SHong Zhang z[row] = y[row] + sum; 892d1451d4SHong Zhang } 902d1451d4SHong Zhang } 91*07e43b41SHong Zhang 92*07e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */ 93*07e43b41SHong Zhang template <int BLOCKY> 94*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 95*07e43b41SHong Zhang { 96*07e43b41SHong Zhang __shared__ MatScalar shared[SLICE_HEIGHT][BLOCKY]; 97*07e43b41SHong Zhang PetscInt i, row, slice_id; 98*07e43b41SHong Zhang slice_id = blockIdx.x; 99*07e43b41SHong Zhang row = slice_id * SLICE_HEIGHT + threadIdx.x; 100*07e43b41SHong Zhang int tid = threadIdx.x + threadIdx.y * SLICE_HEIGHT; 101*07e43b41SHong Zhang /* transposed index */ 102*07e43b41SHong Zhang int tidx = tid % BLOCKY; 103*07e43b41SHong Zhang int tidy = tid / BLOCKY; 104*07e43b41SHong Zhang PetscScalar t = 0.0; 105*07e43b41SHong Zhang if (row < nrows) { 106*07e43b41SHong Zhang for (i = sliidx[slice_id] + threadIdx.x + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * BLOCKY) t += aval[i] * x[acolidx[i]]; 1072d1451d4SHong Zhang } 108*07e43b41SHong Zhang /* transpose layout to reduce each row using warp shfl */ 109*07e43b41SHong Zhang shared[threadIdx.x][threadIdx.y] = t; 110*07e43b41SHong Zhang __syncthreads(); 111*07e43b41SHong Zhang t = shared[tidy][tidx]; 112*07e43b41SHong Zhang #pragma unroll 113*07e43b41SHong Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 114*07e43b41SHong Zhang __syncthreads(); 115*07e43b41SHong Zhang if (tidx == 0) { shared[0][tidy] = t; } 116*07e43b41SHong Zhang __syncthreads(); 117*07e43b41SHong Zhang if (row < nrows && threadIdx.y == 0) { y[row] = shared[0][threadIdx.x]; } 1182d1451d4SHong Zhang } 1192d1451d4SHong Zhang 120*07e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */ 121*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 1222d1451d4SHong Zhang { 123*07e43b41SHong Zhang PetscInt i, row, slice_id; 124*07e43b41SHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 125*07e43b41SHong Zhang row = slice_id * SLICE_HEIGHT + threadIdx.x % SLICE_HEIGHT; 126*07e43b41SHong Zhang double t = 0.0; 127*07e43b41SHong Zhang if (row < nrows) { 128*07e43b41SHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 129*07e43b41SHong Zhang } 130*07e43b41SHong Zhang t += __shfl_down_sync(0xffffffff, t, 16); 131*07e43b41SHong Zhang if (row < nrows && threadIdx.x < 16) { y[row] = t; } 132*07e43b41SHong Zhang } 133*07e43b41SHong Zhang 134*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 135*07e43b41SHong Zhang { 136*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 1372d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 138*07e43b41SHong Zhang /* multiple threads per row. */ 1392d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 1402d1451d4SHong Zhang if (row < nrows) { 1412d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 1422d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 1432d1451d4SHong Zhang 1442d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 1452d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 146*07e43b41SHong Zhang __syncthreads(); 147*07e43b41SHong Zhang if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 148*07e43b41SHong Zhang __syncthreads(); 149*07e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 1502d1451d4SHong Zhang __syncthreads(); 1512d1451d4SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 1522d1451d4SHong Zhang __syncthreads(); 1532d1451d4SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 1542d1451d4SHong Zhang __syncthreads(); 1552d1451d4SHong Zhang if (threadIdx.y < 1) { 156*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 1572d1451d4SHong Zhang y[row] = shared[threadIdx.x]; 1582d1451d4SHong Zhang } 1592d1451d4SHong Zhang } 1602d1451d4SHong Zhang } 1612d1451d4SHong Zhang 162*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 1632d1451d4SHong Zhang { 164*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 1652d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 166*07e43b41SHong Zhang /* multiple threads per row. */ 1672d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 1682d1451d4SHong Zhang if (row < nrows) { 1692d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 1702d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 1712d1451d4SHong Zhang 1722d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 1732d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 174*07e43b41SHong Zhang __syncthreads(); 175*07e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 1762d1451d4SHong Zhang __syncthreads(); 1772d1451d4SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 1782d1451d4SHong Zhang __syncthreads(); 1792d1451d4SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 1802d1451d4SHong Zhang __syncthreads(); 1812d1451d4SHong Zhang if (threadIdx.y < 1) { 182*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 183*07e43b41SHong Zhang y[row] = shared[threadIdx.x]; 184*07e43b41SHong Zhang } 185*07e43b41SHong Zhang } 186*07e43b41SHong Zhang } 187*07e43b41SHong Zhang 188*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 189*07e43b41SHong Zhang { 190*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 191*07e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 192*07e43b41SHong Zhang /* multiple threads per row. */ 193*07e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 194*07e43b41SHong Zhang if (row < nrows) { 195*07e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 196*07e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 197*07e43b41SHong Zhang 198*07e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 199*07e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 200*07e43b41SHong Zhang __syncthreads(); 201*07e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 202*07e43b41SHong Zhang __syncthreads(); 203*07e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 204*07e43b41SHong Zhang __syncthreads(); 205*07e43b41SHong Zhang if (threadIdx.y < 1) { 206*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 207*07e43b41SHong Zhang y[row] = shared[threadIdx.x]; 208*07e43b41SHong Zhang } 209*07e43b41SHong Zhang } 210*07e43b41SHong Zhang } 211*07e43b41SHong Zhang 212*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 213*07e43b41SHong Zhang { 214*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 215*07e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 216*07e43b41SHong Zhang /* multiple threads per row. */ 217*07e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 218*07e43b41SHong Zhang if (row < nrows) { 219*07e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 220*07e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 221*07e43b41SHong Zhang 222*07e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 223*07e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 224*07e43b41SHong Zhang __syncthreads(); 225*07e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 226*07e43b41SHong Zhang __syncthreads(); 227*07e43b41SHong Zhang if (threadIdx.y < 1) { 228*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 229*07e43b41SHong Zhang y[row] = shared[threadIdx.x]; 230*07e43b41SHong Zhang } 231*07e43b41SHong Zhang } 232*07e43b41SHong Zhang } 233*07e43b41SHong Zhang 234*07e43b41SHong Zhang __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 235*07e43b41SHong Zhang { 236*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 237*07e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 238*07e43b41SHong Zhang /* multiple threads per row. */ 239*07e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 240*07e43b41SHong Zhang if (row < nrows) { 241*07e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 242*07e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 243*07e43b41SHong Zhang 244*07e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 245*07e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 246*07e43b41SHong Zhang __syncthreads(); 247*07e43b41SHong Zhang if (threadIdx.y < 1) { 248*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 249*07e43b41SHong Zhang y[row] = shared[threadIdx.x]; 250*07e43b41SHong Zhang } 251*07e43b41SHong Zhang } 252*07e43b41SHong Zhang } 253*07e43b41SHong Zhang 254*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 255*07e43b41SHong Zhang { 256*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 257*07e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 258*07e43b41SHong Zhang /* multiple threads per row. */ 259*07e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 260*07e43b41SHong Zhang if (row < nrows) { 261*07e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 262*07e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 263*07e43b41SHong Zhang 264*07e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 265*07e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 266*07e43b41SHong Zhang __syncthreads(); 267*07e43b41SHong Zhang if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 268*07e43b41SHong Zhang __syncthreads(); 269*07e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 270*07e43b41SHong Zhang __syncthreads(); 271*07e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 272*07e43b41SHong Zhang __syncthreads(); 273*07e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 274*07e43b41SHong Zhang __syncthreads(); 275*07e43b41SHong Zhang if (threadIdx.y < 1) { 276*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 2772d1451d4SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 2782d1451d4SHong Zhang } 2792d1451d4SHong Zhang } 2802d1451d4SHong Zhang } 281*07e43b41SHong Zhang 282*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 283*07e43b41SHong Zhang { 284*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 285*07e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 286*07e43b41SHong Zhang /* multiple threads per row. */ 287*07e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 288*07e43b41SHong Zhang if (row < nrows) { 289*07e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 290*07e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 291*07e43b41SHong Zhang 292*07e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 293*07e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 294*07e43b41SHong Zhang __syncthreads(); 295*07e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 296*07e43b41SHong Zhang __syncthreads(); 297*07e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 298*07e43b41SHong Zhang __syncthreads(); 299*07e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 300*07e43b41SHong Zhang __syncthreads(); 301*07e43b41SHong Zhang if (threadIdx.y < 1) { 302*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 303*07e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 304*07e43b41SHong Zhang } 305*07e43b41SHong Zhang } 306*07e43b41SHong Zhang } 307*07e43b41SHong Zhang 308*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 309*07e43b41SHong Zhang { 310*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 311*07e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 312*07e43b41SHong Zhang /* multiple threads per row. */ 313*07e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 314*07e43b41SHong Zhang if (row < nrows) { 315*07e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 316*07e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 317*07e43b41SHong Zhang 318*07e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 319*07e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 320*07e43b41SHong Zhang __syncthreads(); 321*07e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 322*07e43b41SHong Zhang __syncthreads(); 323*07e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 324*07e43b41SHong Zhang __syncthreads(); 325*07e43b41SHong Zhang if (threadIdx.y < 1) { 326*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 327*07e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 328*07e43b41SHong Zhang } 329*07e43b41SHong Zhang } 330*07e43b41SHong Zhang } 331*07e43b41SHong Zhang 332*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 333*07e43b41SHong Zhang { 334*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 335*07e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 336*07e43b41SHong Zhang /* multiple threads per row. */ 337*07e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 338*07e43b41SHong Zhang if (row < nrows) { 339*07e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 340*07e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 341*07e43b41SHong Zhang 342*07e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 343*07e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 344*07e43b41SHong Zhang __syncthreads(); 345*07e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 346*07e43b41SHong Zhang __syncthreads(); 347*07e43b41SHong Zhang if (threadIdx.y < 1) { 348*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 349*07e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 350*07e43b41SHong Zhang } 351*07e43b41SHong Zhang } 352*07e43b41SHong Zhang } 353*07e43b41SHong Zhang 354*07e43b41SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 355*07e43b41SHong Zhang { 356*07e43b41SHong Zhang __shared__ MatScalar shared[512]; 357*07e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 358*07e43b41SHong Zhang /* multiple threads per row. */ 359*07e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 360*07e43b41SHong Zhang if (row < nrows) { 361*07e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 362*07e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 363*07e43b41SHong Zhang 364*07e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 365*07e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 366*07e43b41SHong Zhang __syncthreads(); 367*07e43b41SHong Zhang if (threadIdx.y < 1) { 368*07e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 369*07e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 370*07e43b41SHong Zhang } 371*07e43b41SHong Zhang } 3722d1451d4SHong Zhang } 3732d1451d4SHong Zhang 3742d1451d4SHong Zhang PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 3752d1451d4SHong Zhang { 3762d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 3772d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 3782d1451d4SHong Zhang PetscScalar *y; 3792d1451d4SHong Zhang const PetscScalar *x; 3802d1451d4SHong Zhang PetscInt totalslices = a->totalslices, nrows = A->rmap->n; 3812d1451d4SHong Zhang MatScalar *aval; 3822d1451d4SHong Zhang PetscInt *acolidx; 3832d1451d4SHong Zhang PetscInt *sliidx; 384*07e43b41SHong Zhang PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 385*07e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 386*07e43b41SHong Zhang dim3 block_k8(32, SLICE_HEIGHT); 3872d1451d4SHong Zhang 3882d1451d4SHong Zhang PetscFunctionBegin; 389*07e43b41SHong Zhang PetscCheck(a->sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 3902d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 3912d1451d4SHong Zhang /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 3922d1451d4SHong Zhang aval = cudastruct->val; 3932d1451d4SHong Zhang acolidx = cudastruct->colidx; 3942d1451d4SHong Zhang sliidx = cudastruct->sliidx; 3952d1451d4SHong Zhang 3962d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 3972d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(yy, &y)); 3982d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 399*07e43b41SHong Zhang 400*07e43b41SHong Zhang switch (cudastruct->kernelchoice) { 401*07e43b41SHong Zhang case 9: 402*07e43b41SHong Zhang if (a->maxslicewidth > 512) { 403*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 404*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 405*07e43b41SHong Zhang } else if (a->avgslicewidth < 8) { 406*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT); 407*07e43b41SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 408*07e43b41SHong Zhang } else if (a->avgslicewidth < 16) { 409*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 410*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(SLICE_HEIGHT, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 411*07e43b41SHong Zhang } else if (a->avgslicewidth < 32) { 412*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 413*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(SLICE_HEIGHT, 8)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 414*07e43b41SHong Zhang } else if (a->avgslicewidth < 64) { 415*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 416*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(SLICE_HEIGHT, 16)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 417*07e43b41SHong Zhang } else { 418*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 419*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 420*07e43b41SHong Zhang } 421*07e43b41SHong Zhang break; 422*07e43b41SHong Zhang case 8: 423*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 424*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 425*07e43b41SHong Zhang break; 426*07e43b41SHong Zhang case 7: 427*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT); 428*07e43b41SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 429*07e43b41SHong Zhang break; 430*07e43b41SHong Zhang case 6: 431*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 432*07e43b41SHong Zhang matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 433*07e43b41SHong Zhang break; 434*07e43b41SHong Zhang case 5: 435*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 436*07e43b41SHong Zhang matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 437*07e43b41SHong Zhang break; 438*07e43b41SHong Zhang case 4: 439*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 440*07e43b41SHong Zhang matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 441*07e43b41SHong Zhang break; 442*07e43b41SHong Zhang case 3: 443*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 444*07e43b41SHong Zhang matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 445*07e43b41SHong Zhang break; 446*07e43b41SHong Zhang case 2: /* 16 slices per block if blocksize=512 */ 447*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 448*07e43b41SHong Zhang matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 449*07e43b41SHong Zhang break; 450*07e43b41SHong Zhang case 1: /* 32 slices per block if blocksize=512 */ 451*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize; 4522d1451d4SHong Zhang matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 453*07e43b41SHong Zhang break; 454*07e43b41SHong Zhang case 0: 455*07e43b41SHong Zhang if (a->maxslicewidth > 64) { 456*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 457*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 4582d1451d4SHong Zhang } else { 459*07e43b41SHong Zhang if (a->avgslicewidth < 8) { 460*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT); 461*07e43b41SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 462*07e43b41SHong Zhang } else if (a->avgslicewidth < 16) { 463*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 464*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(SLICE_HEIGHT, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 465*07e43b41SHong Zhang } else if (a->avgslicewidth < 32) { 466*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 467*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(SLICE_HEIGHT, 8)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 4682d1451d4SHong Zhang } else { 469*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (SLICE_HEIGHT); 470*07e43b41SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(SLICE_HEIGHT, 16)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 4712d1451d4SHong Zhang } 4722d1451d4SHong Zhang } 473*07e43b41SHong Zhang break; 474*07e43b41SHong Zhang } 4752d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 4762d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 4772d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 4782d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 4792d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 4802d1451d4SHong Zhang } 4812d1451d4SHong Zhang 4822d1451d4SHong Zhang PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 4832d1451d4SHong Zhang { 4842d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 4852d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 4862d1451d4SHong Zhang PetscScalar *z; 4872d1451d4SHong Zhang const PetscScalar *y, *x; 4882d1451d4SHong Zhang PetscInt totalslices = a->totalslices, nrows = A->rmap->n; 4892d1451d4SHong Zhang MatScalar *aval = cudastruct->val; 4902d1451d4SHong Zhang PetscInt *acolidx = cudastruct->colidx; 4912d1451d4SHong Zhang PetscInt *sliidx = cudastruct->sliidx; 4922d1451d4SHong Zhang 4932d1451d4SHong Zhang PetscFunctionBegin; 494*07e43b41SHong Zhang PetscCheck(a->sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 4952d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 4962d1451d4SHong Zhang if (a->nz) { 497*07e43b41SHong Zhang PetscInt nblocks, blocksize = 512; 498*07e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 4992d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 5002d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(yy, &y)); 5012d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(zz, &z)); 5022d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 503*07e43b41SHong Zhang 504*07e43b41SHong Zhang switch (cudastruct->kernelchoice) { 505*07e43b41SHong Zhang case 6: 506*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); 507*07e43b41SHong Zhang matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 508*07e43b41SHong Zhang break; 509*07e43b41SHong Zhang case 5: 510*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); 511*07e43b41SHong Zhang matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 512*07e43b41SHong Zhang break; 513*07e43b41SHong Zhang case 4: 514*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); 515*07e43b41SHong Zhang matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 516*07e43b41SHong Zhang break; 517*07e43b41SHong Zhang case 3: 518*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); 519*07e43b41SHong Zhang matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 520*07e43b41SHong Zhang break; 521*07e43b41SHong Zhang case 2: 522*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 523*07e43b41SHong Zhang matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 524*07e43b41SHong Zhang break; 525*07e43b41SHong Zhang case 1: 526*07e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize; 5272d1451d4SHong Zhang matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 528*07e43b41SHong Zhang break; 529*07e43b41SHong Zhang case 0: /* TODO */ 530*07e43b41SHong Zhang break; 5312d1451d4SHong Zhang } 5322d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 5332d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 5342d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(yy, &y)); 5352d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 5362d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 5372d1451d4SHong Zhang } else { 5382d1451d4SHong Zhang PetscCall(VecCopy(yy, zz)); 5392d1451d4SHong Zhang } 5402d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 5412d1451d4SHong Zhang } 5422d1451d4SHong Zhang 5432d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 5442d1451d4SHong Zhang { 545*07e43b41SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 546*07e43b41SHong Zhang PetscInt kernel; 547*07e43b41SHong Zhang PetscBool flg; 548*07e43b41SHong Zhang 5492d1451d4SHong Zhang PetscFunctionBegin; 5502d1451d4SHong Zhang PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 551*07e43b41SHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 552*07e43b41SHong Zhang if (flg) { 553*07e43b41SHong Zhang PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 554*07e43b41SHong Zhang cudastruct->kernelchoice = kernel; 555*07e43b41SHong Zhang } 5562d1451d4SHong Zhang PetscOptionsHeadEnd(); 5572d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 5582d1451d4SHong Zhang } 5592d1451d4SHong Zhang 560*07e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 561*07e43b41SHong Zhang { 562*07e43b41SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 563*07e43b41SHong Zhang 564*07e43b41SHong Zhang PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 565*07e43b41SHong Zhang PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 566*07e43b41SHong Zhang PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 567*07e43b41SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 568*07e43b41SHong Zhang } 569*07e43b41SHong Zhang 5702d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 5712d1451d4SHong Zhang { 5722d1451d4SHong Zhang PetscFunctionBegin; 5732d1451d4SHong Zhang PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 574*07e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 5752d1451d4SHong Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 5762d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 5772d1451d4SHong Zhang A->ops->mult = MatMult_SeqSELLCUDA; 5782d1451d4SHong Zhang A->ops->multadd = MatMultAdd_SeqSELLCUDA; 5792d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 5802d1451d4SHong Zhang } 5812d1451d4SHong Zhang 5822d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 5832d1451d4SHong Zhang { 5842d1451d4SHong Zhang PetscFunctionBegin; 5852d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { 5862d1451d4SHong Zhang if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 5872d1451d4SHong Zhang } 5882d1451d4SHong Zhang PetscCall(MatDestroy_SeqSELL(A)); 5892d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 5902d1451d4SHong Zhang } 5912d1451d4SHong Zhang 5922d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 5932d1451d4SHong Zhang static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 5942d1451d4SHong Zhang { 5952d1451d4SHong Zhang PetscFunctionBegin; 5962d1451d4SHong Zhang PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 5972d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 5982d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 5992d1451d4SHong Zhang } 6002d1451d4SHong Zhang 6012d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 6022d1451d4SHong Zhang { 6032d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct; 6042d1451d4SHong Zhang 6052d1451d4SHong Zhang PetscFunctionBegin; 6062d1451d4SHong Zhang PetscCall(PetscFree(B->defaultvectype)); 6072d1451d4SHong Zhang PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 6082d1451d4SHong Zhang 6092d1451d4SHong Zhang if (!B->spptr) { 6102d1451d4SHong Zhang if (B->factortype == MAT_FACTOR_NONE) { 6112d1451d4SHong Zhang PetscCall(PetscNew(&cudastruct)); 6122d1451d4SHong Zhang B->spptr = cudastruct; 6132d1451d4SHong Zhang } 6142d1451d4SHong Zhang } 6152d1451d4SHong Zhang 6162d1451d4SHong Zhang B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 6172d1451d4SHong Zhang B->ops->destroy = MatDestroy_SeqSELLCUDA; 6182d1451d4SHong Zhang B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 6192d1451d4SHong Zhang B->ops->mult = MatMult_SeqSELLCUDA; 6202d1451d4SHong Zhang B->ops->multadd = MatMultAdd_SeqSELLCUDA; 6212d1451d4SHong Zhang B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 6222d1451d4SHong Zhang 623*07e43b41SHong Zhang /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 624*07e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 625*07e43b41SHong Zhang 6262d1451d4SHong Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 6272d1451d4SHong Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 6282d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 6292d1451d4SHong Zhang } 6302d1451d4SHong Zhang 6312d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 6322d1451d4SHong Zhang { 6332d1451d4SHong Zhang PetscFunctionBegin; 6342d1451d4SHong Zhang PetscCall(MatCreate_SeqSELL(B)); 6352d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 6362d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 6372d1451d4SHong Zhang } 638