12d1451d4SHong Zhang #include <petscdevice_cuda.h> 26eb97cccSStefano Zampini #include <petsc/private/cupmatomics.hpp> 32d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 42d1451d4SHong Zhang 507e43b41SHong Zhang #define SLICE_HEIGHT 16 607e43b41SHong Zhang 72d1451d4SHong Zhang typedef struct { 890d2215bSHong Zhang PetscInt maxallocmat; 990d2215bSHong Zhang PetscInt totalentries; 1090d2215bSHong Zhang PetscInt *colidx; /* column index array, device pointer */ 1190d2215bSHong Zhang MatScalar *val; /* value array, device pointer */ 1290d2215bSHong Zhang PetscInt totalslices; 1390d2215bSHong Zhang PetscInt *sliidx; /* slice index array, device pointer */ 142d1451d4SHong Zhang PetscInt nonzerostate; 1507e43b41SHong Zhang PetscInt kernelchoice; 164e58db63SHong Zhang PetscInt blocky; 1790d2215bSHong Zhang PetscInt chunksperblock; 1890d2215bSHong Zhang PetscInt totalchunks; 1990d2215bSHong Zhang PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */ 202d1451d4SHong Zhang } Mat_SeqSELLCUDA; 212d1451d4SHong Zhang 222d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 232d1451d4SHong Zhang { 242d1451d4SHong Zhang PetscFunctionBegin; 252d1451d4SHong Zhang if (*cudastruct) { 263a7d0413SPierre Jolivet if ((*cudastruct)->colidx) PetscCallCUDA(cudaFree((*cudastruct)->colidx)); 273a7d0413SPierre Jolivet if ((*cudastruct)->val) PetscCallCUDA(cudaFree((*cudastruct)->val)); 283a7d0413SPierre Jolivet if ((*cudastruct)->sliidx) PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); 293a7d0413SPierre Jolivet if ((*cudastruct)->chunk_slice_map) PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); 302d1451d4SHong Zhang PetscCall(PetscFree(*cudastruct)); 312d1451d4SHong Zhang } 322d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 332d1451d4SHong Zhang } 342d1451d4SHong Zhang 352d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 362d1451d4SHong Zhang { 372d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 382d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 392d1451d4SHong Zhang 402d1451d4SHong Zhang PetscFunctionBegin; 412d1451d4SHong Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 422d1451d4SHong Zhang PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 432d1451d4SHong Zhang if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 442d1451d4SHong Zhang /* copy values only */ 452d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 462d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 472d1451d4SHong Zhang } else { 483a7d0413SPierre Jolivet if (cudastruct->colidx) PetscCallCUDA(cudaFree(cudastruct->colidx)); 493a7d0413SPierre Jolivet if (cudastruct->val) PetscCallCUDA(cudaFree(cudastruct->val)); 503a7d0413SPierre Jolivet if (cudastruct->sliidx) PetscCallCUDA(cudaFree(cudastruct->sliidx)); 513a7d0413SPierre Jolivet if (cudastruct->chunk_slice_map) PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); 5290d2215bSHong Zhang cudastruct->maxallocmat = a->maxallocmat; 5390d2215bSHong Zhang cudastruct->totalentries = a->sliidx[a->totalslices]; 5490d2215bSHong Zhang cudastruct->totalslices = a->totalslices; 5590d2215bSHong Zhang cudastruct->totalchunks = a->totalchunks; 56f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx))); 57f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val))); 582d1451d4SHong Zhang /* copy values, nz or maxallocmat? */ 59f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice)); 60f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice)); 6107e43b41SHong Zhang 62f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx))); 63f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice)); 64f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map))); 65f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice)); 6690d2215bSHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt))); 672d1451d4SHong Zhang } 682d1451d4SHong Zhang PetscCallCUDA(WaitForCUDA()); 692d1451d4SHong Zhang PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 702d1451d4SHong Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 712d1451d4SHong Zhang } 722d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 732d1451d4SHong Zhang } 742d1451d4SHong Zhang 7566976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 762d1451d4SHong Zhang { 772d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 782d1451d4SHong Zhang MatScalar sum; 792d1451d4SHong Zhang /* one thread per row. */ 802d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 812d1451d4SHong Zhang if (row < nrows) { 824e58db63SHong Zhang slice_id = row / sliceheight; 834e58db63SHong Zhang row_in_slice = row % sliceheight; 842d1451d4SHong Zhang sum = 0.0; 854e58db63SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 862d1451d4SHong Zhang y[row] = sum; 872d1451d4SHong Zhang } 882d1451d4SHong Zhang } 892d1451d4SHong Zhang 9066976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 912d1451d4SHong Zhang { 922d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 932d1451d4SHong Zhang MatScalar sum; 942d1451d4SHong Zhang /* one thread per row. */ 952d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 962d1451d4SHong Zhang if (row < nrows) { 974e58db63SHong Zhang slice_id = row / sliceheight; 984e58db63SHong Zhang row_in_slice = row % sliceheight; 992d1451d4SHong Zhang sum = 0.0; 1004e58db63SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 1012d1451d4SHong Zhang z[row] = y[row] + sum; 1022d1451d4SHong Zhang } 1032d1451d4SHong Zhang } 10407e43b41SHong Zhang 1058711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX) 10607e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */ 10707e43b41SHong Zhang template <int BLOCKY> 1084e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 10907e43b41SHong Zhang { 1104e58db63SHong Zhang __shared__ MatScalar shared[32][BLOCKY]; 1114e58db63SHong Zhang PetscInt i, row, slice_id = blockIdx.x; 1124e58db63SHong Zhang int tid = threadIdx.x + threadIdx.y * 32; 11307e43b41SHong Zhang /* transposed index */ 11407e43b41SHong Zhang int tidx = tid % BLOCKY; 11507e43b41SHong Zhang int tidy = tid / BLOCKY; 11607e43b41SHong Zhang PetscScalar t = 0.0; 1174e58db63SHong Zhang 1184e58db63SHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 11907e43b41SHong Zhang if (row < nrows) { 1204e58db63SHong Zhang for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 1212d1451d4SHong Zhang } 1224e58db63SHong Zhang #pragma unroll 123*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset); 12407e43b41SHong Zhang /* transpose layout to reduce each row using warp shfl */ 1251f0d1278SHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 12607e43b41SHong Zhang __syncthreads(); 1271f0d1278SHong Zhang if (tidy < sliceheight) t = shared[tidy][tidx]; 12807e43b41SHong Zhang #pragma unroll 129*ac530a7eSPierre Jolivet for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); 130*ac530a7eSPierre Jolivet if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; 13107e43b41SHong Zhang __syncthreads(); 132*ac530a7eSPierre Jolivet if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x]; 1332d1451d4SHong Zhang } 1342d1451d4SHong Zhang 135cca9ff8bSHong Zhang /* use 1 block per slice, suitable for large slice width */ 136cca9ff8bSHong Zhang template <int BLOCKY> 137cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 138cca9ff8bSHong Zhang { 139cca9ff8bSHong Zhang __shared__ MatScalar shared[32][BLOCKY]; 140cca9ff8bSHong Zhang PetscInt i, row, slice_id = blockIdx.x; 141cca9ff8bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32; 142cca9ff8bSHong Zhang /* transposed index */ 143cca9ff8bSHong Zhang int tidx = tid % BLOCKY; 144cca9ff8bSHong Zhang int tidy = tid / BLOCKY; 145cca9ff8bSHong Zhang PetscScalar t = 0.0; 146cca9ff8bSHong Zhang 147cca9ff8bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 148cca9ff8bSHong Zhang if (row < nrows) { 149cca9ff8bSHong Zhang for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 150cca9ff8bSHong Zhang } 151cca9ff8bSHong Zhang #pragma unroll 152*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset); 153cca9ff8bSHong Zhang /* transpose layout to reduce each row using warp shfl */ 1541f0d1278SHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 155cca9ff8bSHong Zhang __syncthreads(); 1561f0d1278SHong Zhang if (tidy < sliceheight) t = shared[tidy][tidx]; 157cca9ff8bSHong Zhang #pragma unroll 158*ac530a7eSPierre Jolivet for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); 159*ac530a7eSPierre Jolivet if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; 160cca9ff8bSHong Zhang __syncthreads(); 161*ac530a7eSPierre Jolivet if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x]; 162cca9ff8bSHong Zhang } 163cca9ff8bSHong Zhang 16490d2215bSHong Zhang template <int BLOCKY> 16566976f2fSJacob Faibussowitsch __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val) 16690d2215bSHong Zhang { 16790d2215bSHong Zhang bool head = true; 16890d2215bSHong Zhang #pragma unroll 16990d2215bSHong Zhang for (int i = 1; i < BLOCKY * 2; i <<= 1) { 17090d2215bSHong Zhang int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16; 17190d2215bSHong Zhang shared[threadIdx.x + threadIdx.y * 32] = 0; 17290d2215bSHong Zhang if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) { 17390d2215bSHong Zhang shared[threadIdx.x + threadIdx.y * 32] = *val; 17490d2215bSHong Zhang if (i == 1) head = false; 17590d2215bSHong Zhang } 17690d2215bSHong Zhang __syncthreads(); 17790d2215bSHong Zhang if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16]; 17890d2215bSHong Zhang __syncthreads(); 17990d2215bSHong Zhang } 18090d2215bSHong Zhang return head; 18190d2215bSHong Zhang } 18290d2215bSHong Zhang 18390d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */ 18490d2215bSHong Zhang template <int BLOCKY> 18590d2215bSHong Zhang __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 18690d2215bSHong Zhang { 18790d2215bSHong Zhang __shared__ MatScalar shared[BLOCKY * 32]; 18890d2215bSHong Zhang PetscInt gid, row, start_slice, cid; 18990d2215bSHong Zhang PetscScalar t = 0.0; 1906eb97cccSStefano Zampini AtomicAdd<MatScalar> atomAdd; 19190d2215bSHong Zhang /* zero out y */ 19290d2215bSHong Zhang for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 19390d2215bSHong Zhang gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 19490d2215bSHong Zhang if (gid < nrows) y[gid] = 0.0; 19590d2215bSHong Zhang } 19690d2215bSHong Zhang for (int iter = 0; iter < chunksperblock; iter++) { 19790d2215bSHong Zhang cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 19890d2215bSHong Zhang if (cid < totalchunks) { 19990d2215bSHong Zhang start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 20090d2215bSHong Zhang gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 20190d2215bSHong Zhang if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 20290d2215bSHong Zhang __shared__ PetscInt flag[BLOCKY * 2]; 20390d2215bSHong Zhang bool write; 204b7c0efcaSStefano Zampini PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices]; 20590d2215bSHong Zhang /* find out the slice that this element belongs to */ 20690d2215bSHong Zhang while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 20790d2215bSHong Zhang if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 20890d2215bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 20990d2215bSHong Zhang if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 21090d2215bSHong Zhang __syncthreads(); 21190d2215bSHong Zhang write = segment_scan<BLOCKY>(flag, shared, &t); 2126eb97cccSStefano Zampini if (row < nrows && gid < totalentries && write) atomAdd(y[row], t); 21390d2215bSHong Zhang t = 0.0; 21490d2215bSHong Zhang } else { /* this iteration covers only one slice */ 21590d2215bSHong Zhang row = start_slice * sliceheight + threadIdx.x % sliceheight; 21690d2215bSHong Zhang if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 21790d2215bSHong Zhang if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 21890d2215bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 21990d2215bSHong Zhang /* reduction and write to output vector */ 22090d2215bSHong Zhang #pragma unroll 221*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset); 22290d2215bSHong Zhang /* transpose layout to reduce each row using warp shfl */ 22390d2215bSHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 22490d2215bSHong Zhang __syncthreads(); 22590d2215bSHong Zhang if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 22690d2215bSHong Zhang #pragma unroll 227*ac530a7eSPierre Jolivet for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); 228*ac530a7eSPierre Jolivet if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ 22990d2215bSHong Zhang __syncthreads(); 2306eb97cccSStefano Zampini if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 23190d2215bSHong Zhang t = 0.0; 23290d2215bSHong Zhang } 23390d2215bSHong Zhang } 23490d2215bSHong Zhang } 23590d2215bSHong Zhang } 23690d2215bSHong Zhang } 23790d2215bSHong Zhang 23890d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */ 23990d2215bSHong Zhang template <int BLOCKY> 24090d2215bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 24190d2215bSHong Zhang { 24290d2215bSHong Zhang __shared__ MatScalar shared[BLOCKY * 32]; 24390d2215bSHong Zhang PetscInt gid, row, start_slice, cid; 24490d2215bSHong Zhang PetscScalar t = 0.0; 2456eb97cccSStefano Zampini AtomicAdd<MatScalar> atomAdd; 24690d2215bSHong Zhang /* copy y to z */ 24790d2215bSHong Zhang for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 24890d2215bSHong Zhang gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 24990d2215bSHong Zhang if (gid < nrows) z[gid] = y[gid]; 25090d2215bSHong Zhang } 25190d2215bSHong Zhang for (int iter = 0; iter < chunksperblock; iter++) { 25290d2215bSHong Zhang cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 25390d2215bSHong Zhang if (cid < totalchunks) { 25490d2215bSHong Zhang start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 25590d2215bSHong Zhang gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 25690d2215bSHong Zhang if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 25790d2215bSHong Zhang __shared__ PetscInt flag[BLOCKY * 2]; 25890d2215bSHong Zhang bool write; 259b7c0efcaSStefano Zampini PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices]; 26090d2215bSHong Zhang /* find out the slice that this element belongs to */ 26190d2215bSHong Zhang while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 26290d2215bSHong Zhang if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 26390d2215bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 26490d2215bSHong Zhang if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 26590d2215bSHong Zhang __syncthreads(); 26690d2215bSHong Zhang write = segment_scan<BLOCKY>(flag, shared, &t); 2676eb97cccSStefano Zampini if (row < nrows && gid < totalentries && write) atomAdd(z[row], t); 26890d2215bSHong Zhang t = 0.0; 26990d2215bSHong Zhang } else { /* this iteration covers only one slice */ 27090d2215bSHong Zhang row = start_slice * sliceheight + threadIdx.x % sliceheight; 27190d2215bSHong Zhang if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 27290d2215bSHong Zhang if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 27390d2215bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 27490d2215bSHong Zhang /* reduction and write to output vector */ 27590d2215bSHong Zhang #pragma unroll 276*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset); 27790d2215bSHong Zhang /* transpose layout to reduce each row using warp shfl */ 27890d2215bSHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 27990d2215bSHong Zhang __syncthreads(); 28090d2215bSHong Zhang if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 28190d2215bSHong Zhang #pragma unroll 282*ac530a7eSPierre Jolivet for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); 283*ac530a7eSPierre Jolivet if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ 28490d2215bSHong Zhang __syncthreads(); 2856eb97cccSStefano Zampini if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 28690d2215bSHong Zhang t = 0.0; 28790d2215bSHong Zhang } 28890d2215bSHong Zhang } 28990d2215bSHong Zhang } 29090d2215bSHong Zhang } 29190d2215bSHong Zhang } 29290d2215bSHong Zhang 29307e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */ 29466976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 2952d1451d4SHong Zhang { 29607e43b41SHong Zhang PetscInt i, row, slice_id; 29707e43b41SHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 2984e58db63SHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 29907e43b41SHong Zhang double t = 0.0; 30007e43b41SHong Zhang if (row < nrows) { 30107e43b41SHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 30207e43b41SHong Zhang } 3034e58db63SHong Zhang #pragma unroll 304*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset); 305*ac530a7eSPierre Jolivet if (row < nrows && threadIdx.x < sliceheight) y[row] = t; 30607e43b41SHong Zhang } 30707e43b41SHong Zhang 308cca9ff8bSHong Zhang /* use 1 warp per slice, suitable for small slice width */ 30966976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 310cca9ff8bSHong Zhang { 311cca9ff8bSHong Zhang PetscInt i, row, slice_id; 312cca9ff8bSHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 313cca9ff8bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 314cca9ff8bSHong Zhang double t = 0.0; 315cca9ff8bSHong Zhang if (row < nrows) { 316cca9ff8bSHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 317cca9ff8bSHong Zhang } 318cca9ff8bSHong Zhang #pragma unroll 319*ac530a7eSPierre Jolivet for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset); 320*ac530a7eSPierre Jolivet if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t; 321cca9ff8bSHong Zhang } 3228711c661SHong Zhang #endif 323cca9ff8bSHong Zhang 324a9dd396cSHong Zhang /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/ 325a9dd396cSHong Zhang 32666976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 32707e43b41SHong Zhang { 32807e43b41SHong Zhang __shared__ MatScalar shared[512]; 3292d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 33007e43b41SHong Zhang /* multiple threads per row. */ 3312d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 3322d1451d4SHong Zhang if (row < nrows) { 3332d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 3342d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 3352d1451d4SHong Zhang 3362d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 3372d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 33807e43b41SHong Zhang __syncthreads(); 339*ac530a7eSPierre Jolivet if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; 34007e43b41SHong Zhang __syncthreads(); 341*ac530a7eSPierre Jolivet if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 3422d1451d4SHong Zhang __syncthreads(); 343*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 3442d1451d4SHong Zhang __syncthreads(); 345*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 3462d1451d4SHong Zhang __syncthreads(); 3472d1451d4SHong Zhang if (threadIdx.y < 1) { 34807e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 3492d1451d4SHong Zhang y[row] = shared[threadIdx.x]; 3502d1451d4SHong Zhang } 3512d1451d4SHong Zhang } 3522d1451d4SHong Zhang } 3532d1451d4SHong Zhang 35466976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 3552d1451d4SHong Zhang { 35607e43b41SHong Zhang __shared__ MatScalar shared[512]; 3572d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 35807e43b41SHong Zhang /* multiple threads per row. */ 3592d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 3602d1451d4SHong Zhang if (row < nrows) { 3612d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 3622d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 3632d1451d4SHong Zhang 3642d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 3652d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 36607e43b41SHong Zhang __syncthreads(); 367*ac530a7eSPierre Jolivet if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 3682d1451d4SHong Zhang __syncthreads(); 369*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 3702d1451d4SHong Zhang __syncthreads(); 371*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 3722d1451d4SHong Zhang __syncthreads(); 3732d1451d4SHong Zhang if (threadIdx.y < 1) { 37407e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 37507e43b41SHong Zhang y[row] = shared[threadIdx.x]; 37607e43b41SHong Zhang } 37707e43b41SHong Zhang } 37807e43b41SHong Zhang } 37907e43b41SHong Zhang 38066976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 38107e43b41SHong Zhang { 38207e43b41SHong Zhang __shared__ MatScalar shared[512]; 38307e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 38407e43b41SHong Zhang /* multiple threads per row. */ 38507e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 38607e43b41SHong Zhang if (row < nrows) { 38707e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 38807e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 38907e43b41SHong Zhang 39007e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 39107e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 39207e43b41SHong Zhang __syncthreads(); 393*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 39407e43b41SHong Zhang __syncthreads(); 395*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 39607e43b41SHong Zhang __syncthreads(); 39707e43b41SHong Zhang if (threadIdx.y < 1) { 39807e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 39907e43b41SHong Zhang y[row] = shared[threadIdx.x]; 40007e43b41SHong Zhang } 40107e43b41SHong Zhang } 40207e43b41SHong Zhang } 40307e43b41SHong Zhang 40466976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 40507e43b41SHong Zhang { 40607e43b41SHong Zhang __shared__ MatScalar shared[512]; 40707e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 40807e43b41SHong Zhang /* multiple threads per row. */ 40907e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 41007e43b41SHong Zhang if (row < nrows) { 41107e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 41207e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 41307e43b41SHong Zhang 41407e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 41507e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 41607e43b41SHong Zhang __syncthreads(); 417*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 41807e43b41SHong Zhang __syncthreads(); 41907e43b41SHong Zhang if (threadIdx.y < 1) { 42007e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 42107e43b41SHong Zhang y[row] = shared[threadIdx.x]; 42207e43b41SHong Zhang } 42307e43b41SHong Zhang } 42407e43b41SHong Zhang } 42507e43b41SHong Zhang 42666976f2fSJacob Faibussowitsch static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 42707e43b41SHong Zhang { 42807e43b41SHong Zhang __shared__ MatScalar shared[512]; 42907e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 43007e43b41SHong Zhang /* multiple threads per row. */ 43107e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 43207e43b41SHong Zhang if (row < nrows) { 43307e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 43407e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 43507e43b41SHong Zhang 43607e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 43707e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 43807e43b41SHong Zhang __syncthreads(); 43907e43b41SHong Zhang if (threadIdx.y < 1) { 44007e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 44107e43b41SHong Zhang y[row] = shared[threadIdx.x]; 44207e43b41SHong Zhang } 44307e43b41SHong Zhang } 44407e43b41SHong Zhang } 44507e43b41SHong Zhang 44666976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 44707e43b41SHong Zhang { 44807e43b41SHong Zhang __shared__ MatScalar shared[512]; 44907e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 45007e43b41SHong Zhang /* multiple threads per row. */ 45107e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 45207e43b41SHong Zhang if (row < nrows) { 45307e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 45407e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 45507e43b41SHong Zhang 45607e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 45707e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 45807e43b41SHong Zhang __syncthreads(); 459*ac530a7eSPierre Jolivet if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; 46007e43b41SHong Zhang __syncthreads(); 461*ac530a7eSPierre Jolivet if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 46207e43b41SHong Zhang __syncthreads(); 463*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 46407e43b41SHong Zhang __syncthreads(); 465*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 46607e43b41SHong Zhang __syncthreads(); 46707e43b41SHong Zhang if (threadIdx.y < 1) { 46807e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 4692d1451d4SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 4702d1451d4SHong Zhang } 4712d1451d4SHong Zhang } 4722d1451d4SHong Zhang } 47307e43b41SHong Zhang 47466976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 47507e43b41SHong Zhang { 47607e43b41SHong Zhang __shared__ MatScalar shared[512]; 47707e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 47807e43b41SHong Zhang /* multiple threads per row. */ 47907e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 48007e43b41SHong Zhang if (row < nrows) { 48107e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 48207e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 48307e43b41SHong Zhang 48407e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 48507e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 48607e43b41SHong Zhang __syncthreads(); 487*ac530a7eSPierre Jolivet if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 48807e43b41SHong Zhang __syncthreads(); 489*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 49007e43b41SHong Zhang __syncthreads(); 491*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 49207e43b41SHong Zhang __syncthreads(); 49307e43b41SHong Zhang if (threadIdx.y < 1) { 49407e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 49507e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 49607e43b41SHong Zhang } 49707e43b41SHong Zhang } 49807e43b41SHong Zhang } 49907e43b41SHong Zhang 50066976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 50107e43b41SHong Zhang { 50207e43b41SHong Zhang __shared__ MatScalar shared[512]; 50307e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 50407e43b41SHong Zhang /* multiple threads per row. */ 50507e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 50607e43b41SHong Zhang if (row < nrows) { 50707e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 50807e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 50907e43b41SHong Zhang 51007e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 51107e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 51207e43b41SHong Zhang __syncthreads(); 513*ac530a7eSPierre Jolivet if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 51407e43b41SHong Zhang __syncthreads(); 515*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 51607e43b41SHong Zhang __syncthreads(); 51707e43b41SHong Zhang if (threadIdx.y < 1) { 51807e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 51907e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 52007e43b41SHong Zhang } 52107e43b41SHong Zhang } 52207e43b41SHong Zhang } 52307e43b41SHong Zhang 52466976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 52507e43b41SHong Zhang { 52607e43b41SHong Zhang __shared__ MatScalar shared[512]; 52707e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 52807e43b41SHong Zhang /* multiple threads per row. */ 52907e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 53007e43b41SHong Zhang if (row < nrows) { 53107e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 53207e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 53307e43b41SHong Zhang 53407e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 53507e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 53607e43b41SHong Zhang __syncthreads(); 537*ac530a7eSPierre Jolivet if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 53807e43b41SHong Zhang __syncthreads(); 53907e43b41SHong Zhang if (threadIdx.y < 1) { 54007e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 54107e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 54207e43b41SHong Zhang } 54307e43b41SHong Zhang } 54407e43b41SHong Zhang } 54507e43b41SHong Zhang 54666976f2fSJacob Faibussowitsch static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 54707e43b41SHong Zhang { 54807e43b41SHong Zhang __shared__ MatScalar shared[512]; 54907e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 55007e43b41SHong Zhang /* multiple threads per row. */ 55107e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 55207e43b41SHong Zhang if (row < nrows) { 55307e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 55407e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 55507e43b41SHong Zhang 55607e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 55707e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 55807e43b41SHong Zhang __syncthreads(); 55907e43b41SHong Zhang if (threadIdx.y < 1) { 56007e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 56107e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 56207e43b41SHong Zhang } 56307e43b41SHong Zhang } 5642d1451d4SHong Zhang } 5652d1451d4SHong Zhang 56666976f2fSJacob Faibussowitsch static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 5672d1451d4SHong Zhang { 5682d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 5692d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 5702d1451d4SHong Zhang PetscScalar *y; 5712d1451d4SHong Zhang const PetscScalar *x; 572a9dd396cSHong Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 5732d1451d4SHong Zhang MatScalar *aval; 5742d1451d4SHong Zhang PetscInt *acolidx; 5752d1451d4SHong Zhang PetscInt *sliidx; 57607e43b41SHong Zhang PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 57707e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 57816a9b8deSJose E. Roman #if !defined(PETSC_USE_COMPLEX) 57916a9b8deSJose E. Roman PetscInt chunksperblock, nchunks, *chunk_slice_map; 580b921024eSHong Zhang PetscReal maxoveravg; 58116a9b8deSJose E. Roman #endif 5822d1451d4SHong Zhang 5832d1451d4SHong Zhang PetscFunctionBegin; 5844e58db63SHong Zhang PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 58590d2215bSHong Zhang PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 5862d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 5872d1451d4SHong Zhang /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 5882d1451d4SHong Zhang aval = cudastruct->val; 5892d1451d4SHong Zhang acolidx = cudastruct->colidx; 5902d1451d4SHong Zhang sliidx = cudastruct->sliidx; 5912d1451d4SHong Zhang 5922d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 5932d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(yy, &y)); 5942d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 59507e43b41SHong Zhang 59607e43b41SHong Zhang switch (cudastruct->kernelchoice) { 5978711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX) 59807e43b41SHong Zhang case 9: 5994e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 6004e58db63SHong Zhang if (cudastruct->blocky == 2) { 6014e58db63SHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6024e58db63SHong Zhang } else if (cudastruct->blocky == 4) { 6034e58db63SHong Zhang matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6044e58db63SHong Zhang } else if (cudastruct->blocky == 8) { 6054e58db63SHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6064e58db63SHong Zhang } else if (cudastruct->blocky == 16) { 6074e58db63SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6084e58db63SHong Zhang } else if (cudastruct->blocky == 32) { 6094e58db63SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 610cca9ff8bSHong Zhang } else { 611cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 61207e43b41SHong Zhang } 61307e43b41SHong Zhang break; 61407e43b41SHong Zhang case 7: 6154e58db63SHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); 6164e58db63SHong Zhang if (cudastruct->blocky == 2) { 6174e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6184e58db63SHong Zhang } else if (cudastruct->blocky == 4) { 6194e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6204e58db63SHong Zhang } else if (cudastruct->blocky == 8) { 6214e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6224e58db63SHong Zhang } else if (cudastruct->blocky == 16) { 6234e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6244e58db63SHong Zhang } else if (cudastruct->blocky == 32) { 6254e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6264e58db63SHong Zhang } else { 6274e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6284e58db63SHong Zhang } 62907e43b41SHong Zhang break; 6308711c661SHong Zhang #endif 63107e43b41SHong Zhang case 6: 63207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 633a9dd396cSHong Zhang matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 63407e43b41SHong Zhang break; 63507e43b41SHong Zhang case 5: 63607e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 637a9dd396cSHong Zhang matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 63807e43b41SHong Zhang break; 63907e43b41SHong Zhang case 4: 64007e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 641a9dd396cSHong Zhang matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 64207e43b41SHong Zhang break; 64307e43b41SHong Zhang case 3: 64407e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 645a9dd396cSHong Zhang matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 64607e43b41SHong Zhang break; 64707e43b41SHong Zhang case 2: /* 16 slices per block if blocksize=512 */ 64807e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 649a9dd396cSHong Zhang matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 65007e43b41SHong Zhang break; 65107e43b41SHong Zhang case 1: /* 32 slices per block if blocksize=512 */ 65207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize; 6534e58db63SHong Zhang matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 65407e43b41SHong Zhang break; 6558711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX) 65607e43b41SHong Zhang case 0: 65790d2215bSHong Zhang maxoveravg = a->maxslicewidth / a->avgslicewidth; 65890d2215bSHong Zhang if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 65990d2215bSHong Zhang /* each block handles approximately one slice */ 66090d2215bSHong Zhang PetscInt blocky = a->chunksize / 32; 66190d2215bSHong Zhang nchunks = cudastruct->totalchunks; 66290d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 66390d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 66490d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map; 66590d2215bSHong Zhang if (blocky == 2) { 66690d2215bSHong Zhang matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 66790d2215bSHong Zhang } else if (blocky == 4) { 66890d2215bSHong Zhang matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 66990d2215bSHong Zhang } else if (blocky == 8) { 67090d2215bSHong Zhang matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 67190d2215bSHong Zhang } else if (blocky == 16) { 67290d2215bSHong Zhang matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 67390d2215bSHong Zhang } else if (blocky == 32) { 67490d2215bSHong Zhang matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 67590d2215bSHong Zhang } else { 67690d2215bSHong Zhang matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 67790d2215bSHong Zhang } 6782d1451d4SHong Zhang } else { 679cca9ff8bSHong Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth; 68090d2215bSHong Zhang if (avgslicesize <= 432) { 68190d2215bSHong Zhang if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 682cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 683cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 68490d2215bSHong Zhang } else { 685cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 686cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 68790d2215bSHong Zhang } 688cca9ff8bSHong Zhang } else if (avgslicesize <= 2400) { 689cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 690cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 691cca9ff8bSHong Zhang } else { 6924e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 6934e58db63SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6942d1451d4SHong Zhang } 6952d1451d4SHong Zhang } 696cca9ff8bSHong Zhang break; 6978711c661SHong Zhang #endif 6988711c661SHong Zhang default: 6998711c661SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice); 700cca9ff8bSHong Zhang } 7012d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 7022d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 7032d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 7042d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 7052d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 7062d1451d4SHong Zhang } 7072d1451d4SHong Zhang 70866976f2fSJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 7092d1451d4SHong Zhang { 7102d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 7112d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 7122d1451d4SHong Zhang PetscScalar *z; 7132d1451d4SHong Zhang const PetscScalar *y, *x; 714a9dd396cSHong Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 7152d1451d4SHong Zhang MatScalar *aval = cudastruct->val; 7162d1451d4SHong Zhang PetscInt *acolidx = cudastruct->colidx; 7172d1451d4SHong Zhang PetscInt *sliidx = cudastruct->sliidx; 71816a9b8deSJose E. Roman #if !defined(PETSC_USE_COMPLEX) 71990d2215bSHong Zhang PetscReal maxoveravg; 72016a9b8deSJose E. Roman PetscInt chunksperblock, nchunks, *chunk_slice_map; 72116a9b8deSJose E. Roman PetscInt blocky = cudastruct->blocky; 72216a9b8deSJose E. Roman #endif 7232d1451d4SHong Zhang 7242d1451d4SHong Zhang PetscFunctionBegin; 72590d2215bSHong Zhang PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 72690d2215bSHong Zhang PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 7272d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 7282d1451d4SHong Zhang if (a->nz) { 72916a9b8deSJose E. Roman PetscInt nblocks, blocksize = 512; 73007e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 7312d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 7322d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(yy, &y)); 7332d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(zz, &z)); 7342d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 73507e43b41SHong Zhang 73607e43b41SHong Zhang switch (cudastruct->kernelchoice) { 7378711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX) 738cca9ff8bSHong Zhang case 9: 739cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 74090d2215bSHong Zhang if (blocky == 2) { 741cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 74290d2215bSHong Zhang } else if (blocky == 4) { 743cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 74490d2215bSHong Zhang } else if (blocky == 8) { 745cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 74690d2215bSHong Zhang } else if (blocky == 16) { 747cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 74890d2215bSHong Zhang } else if (blocky == 32) { 749cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 750cca9ff8bSHong Zhang } else { 751cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 752cca9ff8bSHong Zhang } 753cca9ff8bSHong Zhang break; 75490d2215bSHong Zhang case 8: 75590d2215bSHong Zhang /* each block handles approximately one slice */ 75690d2215bSHong Zhang nchunks = cudastruct->totalchunks; 75790d2215bSHong Zhang blocky = a->chunksize / 32; 75890d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 75990d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 76090d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map; 76190d2215bSHong Zhang if (blocky == 2) { 76290d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 76390d2215bSHong Zhang } else if (blocky == 4) { 76490d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 76590d2215bSHong Zhang } else if (blocky == 8) { 76690d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 76790d2215bSHong Zhang } else if (blocky == 16) { 76890d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 76990d2215bSHong Zhang } else if (blocky == 32) { 77090d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 77190d2215bSHong Zhang } else { 77290d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 77390d2215bSHong Zhang } 77490d2215bSHong Zhang break; 775cca9ff8bSHong Zhang case 7: 776cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); 77790d2215bSHong Zhang if (blocky == 2) { 778cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 77990d2215bSHong Zhang } else if (blocky == 4) { 780cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 78190d2215bSHong Zhang } else if (blocky == 8) { 782cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 78390d2215bSHong Zhang } else if (blocky == 16) { 784cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 78590d2215bSHong Zhang } else if (blocky == 32) { 786cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 787cca9ff8bSHong Zhang } else { 788cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 789cca9ff8bSHong Zhang } 790cca9ff8bSHong Zhang break; 7918711c661SHong Zhang #endif 79207e43b41SHong Zhang case 6: 79307e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); 794a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); 79507e43b41SHong Zhang break; 79607e43b41SHong Zhang case 5: 79707e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); 798a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); 79907e43b41SHong Zhang break; 80007e43b41SHong Zhang case 4: 80107e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); 802a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); 80307e43b41SHong Zhang break; 80407e43b41SHong Zhang case 3: 80507e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); 806a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); 80707e43b41SHong Zhang break; 80807e43b41SHong Zhang case 2: 80907e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 810a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); 81107e43b41SHong Zhang break; 81207e43b41SHong Zhang case 1: 81307e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize; 8144e58db63SHong Zhang matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 81507e43b41SHong Zhang break; 8168711c661SHong Zhang #if !defined(PETSC_USE_COMPLEX) 817cca9ff8bSHong Zhang case 0: 81890d2215bSHong Zhang maxoveravg = a->maxslicewidth / a->avgslicewidth; 81990d2215bSHong Zhang if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 82090d2215bSHong Zhang /* each block handles approximately one slice */ 82190d2215bSHong Zhang nchunks = cudastruct->totalchunks; 82290d2215bSHong Zhang blocky = a->chunksize / 32; 82390d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 82490d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 82590d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map; 82690d2215bSHong Zhang if (blocky == 2) { 82790d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 82890d2215bSHong Zhang } else if (blocky == 4) { 82990d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 83090d2215bSHong Zhang } else if (blocky == 8) { 83190d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 83290d2215bSHong Zhang } else if (blocky == 16) { 83390d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 83490d2215bSHong Zhang } else if (blocky == 32) { 83590d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 83690d2215bSHong Zhang } else { 83790d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 83890d2215bSHong Zhang } 839cca9ff8bSHong Zhang } else { 840cca9ff8bSHong Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth; 84190d2215bSHong Zhang if (avgslicesize <= 432) { 84290d2215bSHong Zhang if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 843cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 844cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 84590d2215bSHong Zhang } else { 846cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 847cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 84890d2215bSHong Zhang } 849cca9ff8bSHong Zhang } else if (avgslicesize <= 2400) { 850cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 851cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 852cca9ff8bSHong Zhang } else { 853cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 854cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 855cca9ff8bSHong Zhang } 856cca9ff8bSHong Zhang } 85707e43b41SHong Zhang break; 8588711c661SHong Zhang #endif 8598711c661SHong Zhang default: 8608711c661SHong Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice); 8612d1451d4SHong Zhang } 8622d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 8632d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 8642d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(yy, &y)); 8652d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 8662d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 8672d1451d4SHong Zhang } else { 8682d1451d4SHong Zhang PetscCall(VecCopy(yy, zz)); 8692d1451d4SHong Zhang } 8702d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 8712d1451d4SHong Zhang } 8722d1451d4SHong Zhang 873ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems PetscOptionsObject) 8742d1451d4SHong Zhang { 87507e43b41SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 8764e58db63SHong Zhang PetscInt kernel, blocky; 87707e43b41SHong Zhang PetscBool flg; 87807e43b41SHong Zhang 8792d1451d4SHong Zhang PetscFunctionBegin; 8802d1451d4SHong Zhang PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 88190d2215bSHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 88290d2215bSHong Zhang if (flg) { 88390d2215bSHong Zhang PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky); 88490d2215bSHong Zhang cudastruct->blocky = blocky; 88590d2215bSHong Zhang } 88607e43b41SHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 88707e43b41SHong Zhang if (flg) { 88807e43b41SHong Zhang PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 88907e43b41SHong Zhang cudastruct->kernelchoice = kernel; 8903a7d0413SPierre Jolivet if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); 8914e58db63SHong Zhang } 8922d1451d4SHong Zhang PetscOptionsHeadEnd(); 8932d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 8942d1451d4SHong Zhang } 8952d1451d4SHong Zhang 89607e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 89707e43b41SHong Zhang { 89807e43b41SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 89907e43b41SHong Zhang 90090d2215bSHong Zhang PetscFunctionBegin; 90107e43b41SHong Zhang PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 90207e43b41SHong Zhang PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 90307e43b41SHong Zhang PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 90407e43b41SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 90507e43b41SHong Zhang } 90607e43b41SHong Zhang 9072d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 9082d1451d4SHong Zhang { 9092d1451d4SHong Zhang PetscFunctionBegin; 9102d1451d4SHong Zhang PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 91107e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 9122d1451d4SHong Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 9133a7d0413SPierre Jolivet if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLCUDACopyToGPU(A)); 9142d1451d4SHong Zhang A->ops->mult = MatMult_SeqSELLCUDA; 9152d1451d4SHong Zhang A->ops->multadd = MatMultAdd_SeqSELLCUDA; 9162d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9172d1451d4SHong Zhang } 9182d1451d4SHong Zhang 9198df136f9SHong Zhang static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A) 9208df136f9SHong Zhang { 9218df136f9SHong Zhang PetscBool both = PETSC_FALSE; 9228df136f9SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 9238df136f9SHong Zhang 9248df136f9SHong Zhang PetscFunctionBegin; 9258df136f9SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { 9268df136f9SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 9278df136f9SHong Zhang if (cudastruct->val) { 9288df136f9SHong Zhang both = PETSC_TRUE; 929f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val))); 9308df136f9SHong Zhang } 9318df136f9SHong Zhang } 9328df136f9SHong Zhang PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 9338df136f9SHong Zhang if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 9348df136f9SHong Zhang else A->offloadmask = PETSC_OFFLOAD_CPU; 9358df136f9SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9368df136f9SHong Zhang } 9378df136f9SHong Zhang 9382d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 9392d1451d4SHong Zhang { 9402d1451d4SHong Zhang PetscFunctionBegin; 9418df136f9SHong Zhang if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); 9422d1451d4SHong Zhang PetscCall(MatDestroy_SeqSELL(A)); 9432d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9442d1451d4SHong Zhang } 9452d1451d4SHong Zhang 9462d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 9472d1451d4SHong Zhang static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 9482d1451d4SHong Zhang { 9492d1451d4SHong Zhang PetscFunctionBegin; 9502d1451d4SHong Zhang PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 9512d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 9522d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9532d1451d4SHong Zhang } 9542d1451d4SHong Zhang 9558df136f9SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 9562d1451d4SHong Zhang { 9572d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct; 9582d1451d4SHong Zhang 9592d1451d4SHong Zhang PetscFunctionBegin; 9602d1451d4SHong Zhang PetscCall(PetscFree(B->defaultvectype)); 9612d1451d4SHong Zhang PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 9622d1451d4SHong Zhang 9632d1451d4SHong Zhang if (!B->spptr) { 9642d1451d4SHong Zhang if (B->factortype == MAT_FACTOR_NONE) { 9652d1451d4SHong Zhang PetscCall(PetscNew(&cudastruct)); 9662d1451d4SHong Zhang B->spptr = cudastruct; 9672d1451d4SHong Zhang } 9682d1451d4SHong Zhang } 9692d1451d4SHong Zhang 9702d1451d4SHong Zhang B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 9712d1451d4SHong Zhang B->ops->destroy = MatDestroy_SeqSELLCUDA; 9722d1451d4SHong Zhang B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 9732d1451d4SHong Zhang B->ops->mult = MatMult_SeqSELLCUDA; 9742d1451d4SHong Zhang B->ops->multadd = MatMultAdd_SeqSELLCUDA; 9752d1451d4SHong Zhang B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 9768df136f9SHong Zhang B->ops->zeroentries = MatZeroEntries_SeqSELLCUDA; 9772d1451d4SHong Zhang 97807e43b41SHong Zhang /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 97907e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 98007e43b41SHong Zhang 9812d1451d4SHong Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 9822d1451d4SHong Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 9832d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9842d1451d4SHong Zhang } 9852d1451d4SHong Zhang 986773bf0f6SHong Zhang /*MC 987773bf0f6SHong Zhang MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs. 988773bf0f6SHong Zhang 989773bf0f6SHong Zhang Options Database Keys: 990773bf0f6SHong Zhang + -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()` 991773bf0f6SHong Zhang . -mat_sell_spmv_cuda_kernel - selects a spmv kernel for MatSELLCUDA 992773bf0f6SHong Zhang - -mat_sell_spmv_cuda_blocky - sets the y dimension of the block size of the spmv kernels. These kernels use a 2D block with the x dimension being 32 993773bf0f6SHong Zhang 994773bf0f6SHong Zhang Level: beginner 995773bf0f6SHong Zhang 996773bf0f6SHong Zhang .seealso: [](ch_matrices), `Mat`, `MATSELLCUDA` 997773bf0f6SHong Zhang M*/ 998773bf0f6SHong Zhang 9992d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 10002d1451d4SHong Zhang { 10012d1451d4SHong Zhang PetscFunctionBegin; 10022d1451d4SHong Zhang PetscCall(MatCreate_SeqSELL(B)); 10032d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 10048df136f9SHong Zhang PetscCall(MatSetFromOptions(B)); 10052d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 10062d1451d4SHong Zhang } 1007