12d1451d4SHong Zhang #include <cuda_runtime.h> 22d1451d4SHong Zhang 32d1451d4SHong Zhang #include <petscdevice_cuda.h> 42d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 52d1451d4SHong Zhang 607e43b41SHong Zhang #define SLICE_HEIGHT 16 707e43b41SHong Zhang 82d1451d4SHong Zhang typedef struct { 9*90d2215bSHong Zhang PetscInt maxallocmat; 10*90d2215bSHong Zhang PetscInt totalentries; 11*90d2215bSHong Zhang PetscInt *colidx; /* column index array, device pointer */ 12*90d2215bSHong Zhang MatScalar *val; /* value array, device pointer */ 13*90d2215bSHong Zhang PetscInt totalslices; 14*90d2215bSHong Zhang PetscInt *sliidx; /* slice index array, device pointer */ 152d1451d4SHong Zhang PetscInt nonzerostate; 1607e43b41SHong Zhang PetscInt kernelchoice; 174e58db63SHong Zhang PetscInt blocky; 18*90d2215bSHong Zhang PetscInt chunksperblock; 19*90d2215bSHong Zhang PetscInt totalchunks; 20*90d2215bSHong Zhang PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */ 212d1451d4SHong Zhang } Mat_SeqSELLCUDA; 222d1451d4SHong Zhang 232d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 242d1451d4SHong Zhang { 252d1451d4SHong Zhang PetscFunctionBegin; 262d1451d4SHong Zhang if (*cudastruct) { 272d1451d4SHong Zhang if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 282d1451d4SHong Zhang if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 292d1451d4SHong Zhang if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 30*90d2215bSHong Zhang if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); } 312d1451d4SHong Zhang PetscCall(PetscFree(*cudastruct)); 322d1451d4SHong Zhang } 332d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 342d1451d4SHong Zhang } 352d1451d4SHong Zhang 362d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 372d1451d4SHong Zhang { 382d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 392d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 402d1451d4SHong Zhang 412d1451d4SHong Zhang PetscFunctionBegin; 422d1451d4SHong Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 432d1451d4SHong Zhang PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 442d1451d4SHong Zhang if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 452d1451d4SHong Zhang /* copy values only */ 462d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 472d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 482d1451d4SHong Zhang } else { 492d1451d4SHong Zhang if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 502d1451d4SHong Zhang if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 512d1451d4SHong Zhang if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 52*90d2215bSHong Zhang if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); } 53*90d2215bSHong Zhang cudastruct->maxallocmat = a->maxallocmat; 54*90d2215bSHong Zhang cudastruct->totalentries = a->sliidx[a->totalslices]; 55*90d2215bSHong Zhang cudastruct->totalslices = a->totalslices; 56*90d2215bSHong Zhang cudastruct->totalchunks = a->totalchunks; 572d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 582d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 592d1451d4SHong Zhang /* copy values, nz or maxallocmat? */ 602d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 612d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 6207e43b41SHong Zhang 6307e43b41SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 6407e43b41SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 65*90d2215bSHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->chunk_slice_map), a->totalchunks * sizeof(PetscInt))); 66*90d2215bSHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(PetscInt), cudaMemcpyHostToDevice)); 67*90d2215bSHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt))); 682d1451d4SHong Zhang } 692d1451d4SHong Zhang PetscCallCUDA(WaitForCUDA()); 702d1451d4SHong Zhang PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 712d1451d4SHong Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 722d1451d4SHong Zhang } 732d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 742d1451d4SHong Zhang } 752d1451d4SHong Zhang 764e58db63SHong Zhang __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 772d1451d4SHong Zhang { 782d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 792d1451d4SHong Zhang MatScalar sum; 802d1451d4SHong Zhang /* one thread per row. */ 812d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 822d1451d4SHong Zhang if (row < nrows) { 834e58db63SHong Zhang slice_id = row / sliceheight; 844e58db63SHong Zhang row_in_slice = row % sliceheight; 852d1451d4SHong Zhang sum = 0.0; 864e58db63SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 872d1451d4SHong Zhang y[row] = sum; 882d1451d4SHong Zhang } 892d1451d4SHong Zhang } 902d1451d4SHong Zhang 914e58db63SHong Zhang __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 922d1451d4SHong Zhang { 932d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 942d1451d4SHong Zhang MatScalar sum; 952d1451d4SHong Zhang /* one thread per row. */ 962d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 972d1451d4SHong Zhang if (row < nrows) { 984e58db63SHong Zhang slice_id = row / sliceheight; 994e58db63SHong Zhang row_in_slice = row % sliceheight; 1002d1451d4SHong Zhang sum = 0.0; 1014e58db63SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 1022d1451d4SHong Zhang z[row] = y[row] + sum; 1032d1451d4SHong Zhang } 1042d1451d4SHong Zhang } 10507e43b41SHong Zhang 10607e43b41SHong Zhang /* use 1 block per slice, suitable for large slice width */ 10707e43b41SHong Zhang template <int BLOCKY> 1084e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 10907e43b41SHong Zhang { 1104e58db63SHong Zhang __shared__ MatScalar shared[32][BLOCKY]; 1114e58db63SHong Zhang PetscInt i, row, slice_id = blockIdx.x; 1124e58db63SHong Zhang int tid = threadIdx.x + threadIdx.y * 32; 11307e43b41SHong Zhang /* transposed index */ 11407e43b41SHong Zhang int tidx = tid % BLOCKY; 11507e43b41SHong Zhang int tidy = tid / BLOCKY; 11607e43b41SHong Zhang PetscScalar t = 0.0; 1174e58db63SHong Zhang 1184e58db63SHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 11907e43b41SHong Zhang if (row < nrows) { 1204e58db63SHong Zhang for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 1212d1451d4SHong Zhang } 1224e58db63SHong Zhang #pragma unroll 1234e58db63SHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 12407e43b41SHong Zhang /* transpose layout to reduce each row using warp shfl */ 1251f0d1278SHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 12607e43b41SHong Zhang __syncthreads(); 1271f0d1278SHong Zhang if (tidy < sliceheight) t = shared[tidy][tidx]; 12807e43b41SHong Zhang #pragma unroll 12907e43b41SHong Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 1304e58db63SHong Zhang if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 13107e43b41SHong Zhang __syncthreads(); 1324e58db63SHong Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; } 1332d1451d4SHong Zhang } 1342d1451d4SHong Zhang 135cca9ff8bSHong Zhang /* use 1 block per slice, suitable for large slice width */ 136cca9ff8bSHong Zhang template <int BLOCKY> 137cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 138cca9ff8bSHong Zhang { 139cca9ff8bSHong Zhang __shared__ MatScalar shared[32][BLOCKY]; 140cca9ff8bSHong Zhang PetscInt i, row, slice_id = blockIdx.x; 141cca9ff8bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32; 142cca9ff8bSHong Zhang /* transposed index */ 143cca9ff8bSHong Zhang int tidx = tid % BLOCKY; 144cca9ff8bSHong Zhang int tidy = tid / BLOCKY; 145cca9ff8bSHong Zhang PetscScalar t = 0.0; 146cca9ff8bSHong Zhang 147cca9ff8bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 148cca9ff8bSHong Zhang if (row < nrows) { 149cca9ff8bSHong Zhang for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 150cca9ff8bSHong Zhang } 151cca9ff8bSHong Zhang #pragma unroll 152cca9ff8bSHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 153cca9ff8bSHong Zhang /* transpose layout to reduce each row using warp shfl */ 1541f0d1278SHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 155cca9ff8bSHong Zhang __syncthreads(); 1561f0d1278SHong Zhang if (tidy < sliceheight) t = shared[tidy][tidx]; 157cca9ff8bSHong Zhang #pragma unroll 158cca9ff8bSHong Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 159cca9ff8bSHong Zhang if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 160cca9ff8bSHong Zhang __syncthreads(); 161cca9ff8bSHong Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; } 162cca9ff8bSHong Zhang } 163cca9ff8bSHong Zhang 164*90d2215bSHong Zhang template <int BLOCKY> 165*90d2215bSHong Zhang __device__ __forceinline__ bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val) 166*90d2215bSHong Zhang { 167*90d2215bSHong Zhang bool head = true; 168*90d2215bSHong Zhang #pragma unroll 169*90d2215bSHong Zhang for (int i = 1; i < BLOCKY * 2; i <<= 1) { 170*90d2215bSHong Zhang int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16; 171*90d2215bSHong Zhang shared[threadIdx.x + threadIdx.y * 32] = 0; 172*90d2215bSHong Zhang if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) { 173*90d2215bSHong Zhang shared[threadIdx.x + threadIdx.y * 32] = *val; 174*90d2215bSHong Zhang if (i == 1) head = false; 175*90d2215bSHong Zhang } 176*90d2215bSHong Zhang __syncthreads(); 177*90d2215bSHong Zhang if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16]; 178*90d2215bSHong Zhang __syncthreads(); 179*90d2215bSHong Zhang } 180*90d2215bSHong Zhang return head; 181*90d2215bSHong Zhang } 182*90d2215bSHong Zhang 183*90d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */ 184*90d2215bSHong Zhang template <int BLOCKY> 185*90d2215bSHong Zhang __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 186*90d2215bSHong Zhang { 187*90d2215bSHong Zhang __shared__ MatScalar shared[BLOCKY * 32]; 188*90d2215bSHong Zhang PetscInt gid, row, start_slice, cid; 189*90d2215bSHong Zhang PetscScalar t = 0.0; 190*90d2215bSHong Zhang /* zero out y */ 191*90d2215bSHong Zhang for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 192*90d2215bSHong Zhang gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 193*90d2215bSHong Zhang if (gid < nrows) y[gid] = 0.0; 194*90d2215bSHong Zhang } 195*90d2215bSHong Zhang for (int iter = 0; iter < chunksperblock; iter++) { 196*90d2215bSHong Zhang cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 197*90d2215bSHong Zhang if (cid < totalchunks) { 198*90d2215bSHong Zhang start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 199*90d2215bSHong Zhang gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 200*90d2215bSHong Zhang if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 201*90d2215bSHong Zhang __shared__ PetscInt flag[BLOCKY * 2]; 202*90d2215bSHong Zhang bool write; 203*90d2215bSHong Zhang PetscInt slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices]; 204*90d2215bSHong Zhang /* find out the slice that this element belongs to */ 205*90d2215bSHong Zhang while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 206*90d2215bSHong Zhang if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 207*90d2215bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 208*90d2215bSHong Zhang if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 209*90d2215bSHong Zhang __syncthreads(); 210*90d2215bSHong Zhang write = segment_scan<BLOCKY>(flag, shared, &t); 211*90d2215bSHong Zhang if (row < nrows && gid < totalentries && write) atomicAdd(&y[row], t); 212*90d2215bSHong Zhang t = 0.0; 213*90d2215bSHong Zhang } else { /* this iteration covers only one slice */ 214*90d2215bSHong Zhang row = start_slice * sliceheight + threadIdx.x % sliceheight; 215*90d2215bSHong Zhang if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 216*90d2215bSHong Zhang if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 217*90d2215bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 218*90d2215bSHong Zhang /* reduction and write to output vector */ 219*90d2215bSHong Zhang #pragma unroll 220*90d2215bSHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 221*90d2215bSHong Zhang /* transpose layout to reduce each row using warp shfl */ 222*90d2215bSHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 223*90d2215bSHong Zhang __syncthreads(); 224*90d2215bSHong Zhang if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 225*90d2215bSHong Zhang #pragma unroll 226*90d2215bSHong Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 227*90d2215bSHong Zhang if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ } 228*90d2215bSHong Zhang __syncthreads(); 229*90d2215bSHong Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 230*90d2215bSHong Zhang t = 0.0; 231*90d2215bSHong Zhang } 232*90d2215bSHong Zhang } 233*90d2215bSHong Zhang } 234*90d2215bSHong Zhang } 235*90d2215bSHong Zhang } 236*90d2215bSHong Zhang 237*90d2215bSHong Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */ 238*90d2215bSHong Zhang template <int BLOCKY> 239*90d2215bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 240*90d2215bSHong Zhang { 241*90d2215bSHong Zhang __shared__ MatScalar shared[BLOCKY * 32]; 242*90d2215bSHong Zhang PetscInt gid, row, start_slice, cid; 243*90d2215bSHong Zhang PetscScalar t = 0.0; 244*90d2215bSHong Zhang /* copy y to z */ 245*90d2215bSHong Zhang for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 246*90d2215bSHong Zhang gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 247*90d2215bSHong Zhang if (gid < nrows) z[gid] = y[gid]; 248*90d2215bSHong Zhang } 249*90d2215bSHong Zhang for (int iter = 0; iter < chunksperblock; iter++) { 250*90d2215bSHong Zhang cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 251*90d2215bSHong Zhang if (cid < totalchunks) { 252*90d2215bSHong Zhang start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 253*90d2215bSHong Zhang gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 254*90d2215bSHong Zhang if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 255*90d2215bSHong Zhang __shared__ PetscInt flag[BLOCKY * 2]; 256*90d2215bSHong Zhang bool write; 257*90d2215bSHong Zhang PetscInt slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices]; 258*90d2215bSHong Zhang /* find out the slice that this element belongs to */ 259*90d2215bSHong Zhang while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 260*90d2215bSHong Zhang if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 261*90d2215bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 262*90d2215bSHong Zhang if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 263*90d2215bSHong Zhang __syncthreads(); 264*90d2215bSHong Zhang write = segment_scan<BLOCKY>(flag, shared, &t); 265*90d2215bSHong Zhang if (row < nrows && gid < totalentries && write) atomicAdd(&z[row], t); 266*90d2215bSHong Zhang t = 0.0; 267*90d2215bSHong Zhang } else { /* this iteration covers only one slice */ 268*90d2215bSHong Zhang row = start_slice * sliceheight + threadIdx.x % sliceheight; 269*90d2215bSHong Zhang if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 270*90d2215bSHong Zhang if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 271*90d2215bSHong Zhang int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 272*90d2215bSHong Zhang /* reduction and write to output vector */ 273*90d2215bSHong Zhang #pragma unroll 274*90d2215bSHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 275*90d2215bSHong Zhang /* transpose layout to reduce each row using warp shfl */ 276*90d2215bSHong Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 277*90d2215bSHong Zhang __syncthreads(); 278*90d2215bSHong Zhang if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 279*90d2215bSHong Zhang #pragma unroll 280*90d2215bSHong Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 281*90d2215bSHong Zhang if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ } 282*90d2215bSHong Zhang __syncthreads(); 283*90d2215bSHong Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 284*90d2215bSHong Zhang t = 0.0; 285*90d2215bSHong Zhang } 286*90d2215bSHong Zhang } 287*90d2215bSHong Zhang } 288*90d2215bSHong Zhang } 289*90d2215bSHong Zhang } 290*90d2215bSHong Zhang 29107e43b41SHong Zhang /* use 1 warp per slice, suitable for small slice width */ 2924e58db63SHong Zhang __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 2932d1451d4SHong Zhang { 29407e43b41SHong Zhang PetscInt i, row, slice_id; 29507e43b41SHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 2964e58db63SHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 29707e43b41SHong Zhang double t = 0.0; 29807e43b41SHong Zhang if (row < nrows) { 29907e43b41SHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 30007e43b41SHong Zhang } 3014e58db63SHong Zhang #pragma unroll 3024e58db63SHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 3034e58db63SHong Zhang if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; } 30407e43b41SHong Zhang } 30507e43b41SHong Zhang 306cca9ff8bSHong Zhang /* use 1 warp per slice, suitable for small slice width */ 307cca9ff8bSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 308cca9ff8bSHong Zhang { 309cca9ff8bSHong Zhang PetscInt i, row, slice_id; 310cca9ff8bSHong Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 311cca9ff8bSHong Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 312cca9ff8bSHong Zhang double t = 0.0; 313cca9ff8bSHong Zhang if (row < nrows) { 314cca9ff8bSHong Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 315cca9ff8bSHong Zhang } 316cca9ff8bSHong Zhang #pragma unroll 317cca9ff8bSHong Zhang for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 318cca9ff8bSHong Zhang if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; } 319cca9ff8bSHong Zhang } 320cca9ff8bSHong Zhang 321a9dd396cSHong Zhang /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/ 322a9dd396cSHong Zhang 323a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 32407e43b41SHong Zhang { 32507e43b41SHong Zhang __shared__ MatScalar shared[512]; 3262d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 32707e43b41SHong Zhang /* multiple threads per row. */ 3282d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 3292d1451d4SHong Zhang if (row < nrows) { 3302d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 3312d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 3322d1451d4SHong Zhang 3332d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 3342d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 33507e43b41SHong Zhang __syncthreads(); 33607e43b41SHong Zhang if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 33707e43b41SHong Zhang __syncthreads(); 33807e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 3392d1451d4SHong Zhang __syncthreads(); 3402d1451d4SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 3412d1451d4SHong Zhang __syncthreads(); 3422d1451d4SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 3432d1451d4SHong Zhang __syncthreads(); 3442d1451d4SHong Zhang if (threadIdx.y < 1) { 34507e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 3462d1451d4SHong Zhang y[row] = shared[threadIdx.x]; 3472d1451d4SHong Zhang } 3482d1451d4SHong Zhang } 3492d1451d4SHong Zhang } 3502d1451d4SHong Zhang 351a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 3522d1451d4SHong Zhang { 35307e43b41SHong Zhang __shared__ MatScalar shared[512]; 3542d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 35507e43b41SHong Zhang /* multiple threads per row. */ 3562d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 3572d1451d4SHong Zhang if (row < nrows) { 3582d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 3592d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 3602d1451d4SHong Zhang 3612d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 3622d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 36307e43b41SHong Zhang __syncthreads(); 36407e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 3652d1451d4SHong Zhang __syncthreads(); 3662d1451d4SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 3672d1451d4SHong Zhang __syncthreads(); 3682d1451d4SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 3692d1451d4SHong Zhang __syncthreads(); 3702d1451d4SHong Zhang if (threadIdx.y < 1) { 37107e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 37207e43b41SHong Zhang y[row] = shared[threadIdx.x]; 37307e43b41SHong Zhang } 37407e43b41SHong Zhang } 37507e43b41SHong Zhang } 37607e43b41SHong Zhang 377a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 37807e43b41SHong Zhang { 37907e43b41SHong Zhang __shared__ MatScalar shared[512]; 38007e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 38107e43b41SHong Zhang /* multiple threads per row. */ 38207e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 38307e43b41SHong Zhang if (row < nrows) { 38407e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 38507e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 38607e43b41SHong Zhang 38707e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 38807e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 38907e43b41SHong Zhang __syncthreads(); 39007e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 39107e43b41SHong Zhang __syncthreads(); 39207e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 39307e43b41SHong Zhang __syncthreads(); 39407e43b41SHong Zhang if (threadIdx.y < 1) { 39507e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 39607e43b41SHong Zhang y[row] = shared[threadIdx.x]; 39707e43b41SHong Zhang } 39807e43b41SHong Zhang } 39907e43b41SHong Zhang } 40007e43b41SHong Zhang 401a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 40207e43b41SHong Zhang { 40307e43b41SHong Zhang __shared__ MatScalar shared[512]; 40407e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 40507e43b41SHong Zhang /* multiple threads per row. */ 40607e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 40707e43b41SHong Zhang if (row < nrows) { 40807e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 40907e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 41007e43b41SHong Zhang 41107e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 41207e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 41307e43b41SHong Zhang __syncthreads(); 41407e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 41507e43b41SHong Zhang __syncthreads(); 41607e43b41SHong Zhang if (threadIdx.y < 1) { 41707e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 41807e43b41SHong Zhang y[row] = shared[threadIdx.x]; 41907e43b41SHong Zhang } 42007e43b41SHong Zhang } 42107e43b41SHong Zhang } 42207e43b41SHong Zhang 423a9dd396cSHong Zhang __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 42407e43b41SHong Zhang { 42507e43b41SHong Zhang __shared__ MatScalar shared[512]; 42607e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 42707e43b41SHong Zhang /* multiple threads per row. */ 42807e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 42907e43b41SHong Zhang if (row < nrows) { 43007e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 43107e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 43207e43b41SHong Zhang 43307e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 43407e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 43507e43b41SHong Zhang __syncthreads(); 43607e43b41SHong Zhang if (threadIdx.y < 1) { 43707e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 43807e43b41SHong Zhang y[row] = shared[threadIdx.x]; 43907e43b41SHong Zhang } 44007e43b41SHong Zhang } 44107e43b41SHong Zhang } 44207e43b41SHong Zhang 443a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 44407e43b41SHong Zhang { 44507e43b41SHong Zhang __shared__ MatScalar shared[512]; 44607e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 44707e43b41SHong Zhang /* multiple threads per row. */ 44807e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 44907e43b41SHong Zhang if (row < nrows) { 45007e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 45107e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 45207e43b41SHong Zhang 45307e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 45407e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 45507e43b41SHong Zhang __syncthreads(); 45607e43b41SHong Zhang if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 45707e43b41SHong Zhang __syncthreads(); 45807e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 45907e43b41SHong Zhang __syncthreads(); 46007e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 46107e43b41SHong Zhang __syncthreads(); 46207e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 46307e43b41SHong Zhang __syncthreads(); 46407e43b41SHong Zhang if (threadIdx.y < 1) { 46507e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 4662d1451d4SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 4672d1451d4SHong Zhang } 4682d1451d4SHong Zhang } 4692d1451d4SHong Zhang } 47007e43b41SHong Zhang 471a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 47207e43b41SHong Zhang { 47307e43b41SHong Zhang __shared__ MatScalar shared[512]; 47407e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 47507e43b41SHong Zhang /* multiple threads per row. */ 47607e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 47707e43b41SHong Zhang if (row < nrows) { 47807e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 47907e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 48007e43b41SHong Zhang 48107e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 48207e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 48307e43b41SHong Zhang __syncthreads(); 48407e43b41SHong Zhang if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 48507e43b41SHong Zhang __syncthreads(); 48607e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 48707e43b41SHong Zhang __syncthreads(); 48807e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 48907e43b41SHong Zhang __syncthreads(); 49007e43b41SHong Zhang if (threadIdx.y < 1) { 49107e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 49207e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 49307e43b41SHong Zhang } 49407e43b41SHong Zhang } 49507e43b41SHong Zhang } 49607e43b41SHong Zhang 497a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 49807e43b41SHong Zhang { 49907e43b41SHong Zhang __shared__ MatScalar shared[512]; 50007e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 50107e43b41SHong Zhang /* multiple threads per row. */ 50207e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 50307e43b41SHong Zhang if (row < nrows) { 50407e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 50507e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 50607e43b41SHong Zhang 50707e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 50807e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 50907e43b41SHong Zhang __syncthreads(); 51007e43b41SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 51107e43b41SHong Zhang __syncthreads(); 51207e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 51307e43b41SHong Zhang __syncthreads(); 51407e43b41SHong Zhang if (threadIdx.y < 1) { 51507e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 51607e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 51707e43b41SHong Zhang } 51807e43b41SHong Zhang } 51907e43b41SHong Zhang } 52007e43b41SHong Zhang 521a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 52207e43b41SHong Zhang { 52307e43b41SHong Zhang __shared__ MatScalar shared[512]; 52407e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 52507e43b41SHong Zhang /* multiple threads per row. */ 52607e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 52707e43b41SHong Zhang if (row < nrows) { 52807e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 52907e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 53007e43b41SHong Zhang 53107e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 53207e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 53307e43b41SHong Zhang __syncthreads(); 53407e43b41SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 53507e43b41SHong Zhang __syncthreads(); 53607e43b41SHong Zhang if (threadIdx.y < 1) { 53707e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 53807e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 53907e43b41SHong Zhang } 54007e43b41SHong Zhang } 54107e43b41SHong Zhang } 54207e43b41SHong Zhang 543a9dd396cSHong Zhang __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 54407e43b41SHong Zhang { 54507e43b41SHong Zhang __shared__ MatScalar shared[512]; 54607e43b41SHong Zhang PetscInt i, row, slice_id, row_in_slice; 54707e43b41SHong Zhang /* multiple threads per row. */ 54807e43b41SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 54907e43b41SHong Zhang if (row < nrows) { 55007e43b41SHong Zhang slice_id = row / SLICE_HEIGHT; 55107e43b41SHong Zhang row_in_slice = row % SLICE_HEIGHT; 55207e43b41SHong Zhang 55307e43b41SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 55407e43b41SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 55507e43b41SHong Zhang __syncthreads(); 55607e43b41SHong Zhang if (threadIdx.y < 1) { 55707e43b41SHong Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 55807e43b41SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 55907e43b41SHong Zhang } 56007e43b41SHong Zhang } 5612d1451d4SHong Zhang } 5622d1451d4SHong Zhang 5632d1451d4SHong Zhang PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 5642d1451d4SHong Zhang { 5652d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 5662d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 5672d1451d4SHong Zhang PetscScalar *y; 5682d1451d4SHong Zhang const PetscScalar *x; 569a9dd396cSHong Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 570*90d2215bSHong Zhang PetscInt chunksperblock, nchunks, *chunk_slice_map; 5712d1451d4SHong Zhang MatScalar *aval; 5722d1451d4SHong Zhang PetscInt *acolidx; 5732d1451d4SHong Zhang PetscInt *sliidx; 57407e43b41SHong Zhang PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 57507e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 5762d1451d4SHong Zhang 5772d1451d4SHong Zhang PetscFunctionBegin; 5784e58db63SHong Zhang PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 579*90d2215bSHong Zhang PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 5802d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 5812d1451d4SHong Zhang /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 5822d1451d4SHong Zhang aval = cudastruct->val; 5832d1451d4SHong Zhang acolidx = cudastruct->colidx; 5842d1451d4SHong Zhang sliidx = cudastruct->sliidx; 5852d1451d4SHong Zhang 5862d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 5872d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(yy, &y)); 5882d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 58907e43b41SHong Zhang 59007e43b41SHong Zhang switch (cudastruct->kernelchoice) { 59107e43b41SHong Zhang case 9: 5924e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 5934e58db63SHong Zhang if (cudastruct->blocky == 2) { 5944e58db63SHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 5954e58db63SHong Zhang } else if (cudastruct->blocky == 4) { 5964e58db63SHong Zhang matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 5974e58db63SHong Zhang } else if (cudastruct->blocky == 8) { 5984e58db63SHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 5994e58db63SHong Zhang } else if (cudastruct->blocky == 16) { 6004e58db63SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6014e58db63SHong Zhang } else if (cudastruct->blocky == 32) { 6024e58db63SHong Zhang matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 603cca9ff8bSHong Zhang } else { 604cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 60507e43b41SHong Zhang } 60607e43b41SHong Zhang break; 60707e43b41SHong Zhang case 7: 6084e58db63SHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); 6094e58db63SHong Zhang if (cudastruct->blocky == 2) { 6104e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6114e58db63SHong Zhang } else if (cudastruct->blocky == 4) { 6124e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6134e58db63SHong Zhang } else if (cudastruct->blocky == 8) { 6144e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6154e58db63SHong Zhang } else if (cudastruct->blocky == 16) { 6164e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6174e58db63SHong Zhang } else if (cudastruct->blocky == 32) { 6184e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6194e58db63SHong Zhang } else { 6204e58db63SHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6214e58db63SHong Zhang } 62207e43b41SHong Zhang break; 62307e43b41SHong Zhang case 6: 62407e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 625a9dd396cSHong Zhang matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 62607e43b41SHong Zhang break; 62707e43b41SHong Zhang case 5: 62807e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 629a9dd396cSHong Zhang matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 63007e43b41SHong Zhang break; 63107e43b41SHong Zhang case 4: 63207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 633a9dd396cSHong Zhang matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 63407e43b41SHong Zhang break; 63507e43b41SHong Zhang case 3: 63607e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 637a9dd396cSHong Zhang matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 63807e43b41SHong Zhang break; 63907e43b41SHong Zhang case 2: /* 16 slices per block if blocksize=512 */ 64007e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 641a9dd396cSHong Zhang matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 64207e43b41SHong Zhang break; 64307e43b41SHong Zhang case 1: /* 32 slices per block if blocksize=512 */ 64407e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize; 6454e58db63SHong Zhang matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 64607e43b41SHong Zhang break; 64707e43b41SHong Zhang case 0: 648*90d2215bSHong Zhang maxoveravg = a->maxslicewidth / a->avgslicewidth; 649*90d2215bSHong Zhang if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 650*90d2215bSHong Zhang /* each block handles approximately one slice */ 651*90d2215bSHong Zhang PetscInt blocky = a->chunksize / 32; 652*90d2215bSHong Zhang nchunks = cudastruct->totalchunks; 653*90d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 654*90d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 655*90d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map; 656*90d2215bSHong Zhang if (blocky == 2) { 657*90d2215bSHong Zhang matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 658*90d2215bSHong Zhang } else if (blocky == 4) { 659*90d2215bSHong Zhang matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 660*90d2215bSHong Zhang } else if (blocky == 8) { 661*90d2215bSHong Zhang matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 662*90d2215bSHong Zhang } else if (blocky == 16) { 663*90d2215bSHong Zhang matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 664*90d2215bSHong Zhang } else if (blocky == 32) { 665*90d2215bSHong Zhang matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 666*90d2215bSHong Zhang } else { 667*90d2215bSHong Zhang matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 668*90d2215bSHong Zhang } 6692d1451d4SHong Zhang } else { 670cca9ff8bSHong Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth; 671*90d2215bSHong Zhang if (avgslicesize <= 432) { 672*90d2215bSHong Zhang if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 673cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 674cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 675*90d2215bSHong Zhang } else { 676cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 677cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 678*90d2215bSHong Zhang } 679cca9ff8bSHong Zhang } else if (avgslicesize <= 2400) { 680cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 681cca9ff8bSHong Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 682cca9ff8bSHong Zhang } else { 6834e58db63SHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 6844e58db63SHong Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 6852d1451d4SHong Zhang } 6862d1451d4SHong Zhang } 687cca9ff8bSHong Zhang break; 688cca9ff8bSHong Zhang } 6892d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 6902d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 6912d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 6922d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 6932d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 6942d1451d4SHong Zhang } 6952d1451d4SHong Zhang 6962d1451d4SHong Zhang PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 6972d1451d4SHong Zhang { 6982d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 6992d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 7002d1451d4SHong Zhang PetscScalar *z; 7012d1451d4SHong Zhang const PetscScalar *y, *x; 702a9dd396cSHong Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 703*90d2215bSHong Zhang PetscInt chunksperblock, nchunks, *chunk_slice_map; 7042d1451d4SHong Zhang MatScalar *aval = cudastruct->val; 7052d1451d4SHong Zhang PetscInt *acolidx = cudastruct->colidx; 7062d1451d4SHong Zhang PetscInt *sliidx = cudastruct->sliidx; 707*90d2215bSHong Zhang PetscReal maxoveravg; 7082d1451d4SHong Zhang 7092d1451d4SHong Zhang PetscFunctionBegin; 710*90d2215bSHong Zhang PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 711*90d2215bSHong Zhang PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 7122d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 7132d1451d4SHong Zhang if (a->nz) { 714*90d2215bSHong Zhang PetscInt blocky = cudastruct->blocky, nblocks, blocksize = 512; 71507e43b41SHong Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 7162d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 7172d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(yy, &y)); 7182d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(zz, &z)); 7192d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 72007e43b41SHong Zhang 72107e43b41SHong Zhang switch (cudastruct->kernelchoice) { 722cca9ff8bSHong Zhang case 9: 723cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 724*90d2215bSHong Zhang if (blocky == 2) { 725cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 726*90d2215bSHong Zhang } else if (blocky == 4) { 727cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 728*90d2215bSHong Zhang } else if (blocky == 8) { 729cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 730*90d2215bSHong Zhang } else if (blocky == 16) { 731cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 732*90d2215bSHong Zhang } else if (blocky == 32) { 733cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 734cca9ff8bSHong Zhang } else { 735cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 736cca9ff8bSHong Zhang } 737cca9ff8bSHong Zhang break; 738*90d2215bSHong Zhang case 8: 739*90d2215bSHong Zhang /* each block handles approximately one slice */ 740*90d2215bSHong Zhang nchunks = cudastruct->totalchunks; 741*90d2215bSHong Zhang blocky = a->chunksize / 32; 742*90d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 743*90d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 744*90d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map; 745*90d2215bSHong Zhang if (blocky == 2) { 746*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 747*90d2215bSHong Zhang } else if (blocky == 4) { 748*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 749*90d2215bSHong Zhang } else if (blocky == 8) { 750*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 751*90d2215bSHong Zhang } else if (blocky == 16) { 752*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 753*90d2215bSHong Zhang } else if (blocky == 32) { 754*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 755*90d2215bSHong Zhang } else { 756*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 757*90d2215bSHong Zhang } 758*90d2215bSHong Zhang break; 759cca9ff8bSHong Zhang case 7: 760cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); 761*90d2215bSHong Zhang if (blocky == 2) { 762cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 763*90d2215bSHong Zhang } else if (blocky == 4) { 764cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 765*90d2215bSHong Zhang } else if (blocky == 8) { 766cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 767*90d2215bSHong Zhang } else if (blocky == 16) { 768cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 769*90d2215bSHong Zhang } else if (blocky == 32) { 770cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 771cca9ff8bSHong Zhang } else { 772cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 773cca9ff8bSHong Zhang } 774cca9ff8bSHong Zhang break; 77507e43b41SHong Zhang case 6: 77607e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); 777a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); 77807e43b41SHong Zhang break; 77907e43b41SHong Zhang case 5: 78007e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); 781a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); 78207e43b41SHong Zhang break; 78307e43b41SHong Zhang case 4: 78407e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); 785a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); 78607e43b41SHong Zhang break; 78707e43b41SHong Zhang case 3: 78807e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); 789a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); 79007e43b41SHong Zhang break; 79107e43b41SHong Zhang case 2: 79207e43b41SHong Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 793a9dd396cSHong Zhang matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); 79407e43b41SHong Zhang break; 79507e43b41SHong Zhang case 1: 79607e43b41SHong Zhang nblocks = 1 + (nrows - 1) / blocksize; 7974e58db63SHong Zhang matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 79807e43b41SHong Zhang break; 799cca9ff8bSHong Zhang case 0: 800*90d2215bSHong Zhang maxoveravg = a->maxslicewidth / a->avgslicewidth; 801*90d2215bSHong Zhang if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 802*90d2215bSHong Zhang /* each block handles approximately one slice */ 803*90d2215bSHong Zhang nchunks = cudastruct->totalchunks; 804*90d2215bSHong Zhang blocky = a->chunksize / 32; 805*90d2215bSHong Zhang chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 806*90d2215bSHong Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 807*90d2215bSHong Zhang chunk_slice_map = cudastruct->chunk_slice_map; 808*90d2215bSHong Zhang if (blocky == 2) { 809*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 810*90d2215bSHong Zhang } else if (blocky == 4) { 811*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 812*90d2215bSHong Zhang } else if (blocky == 8) { 813*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 814*90d2215bSHong Zhang } else if (blocky == 16) { 815*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 816*90d2215bSHong Zhang } else if (blocky == 32) { 817*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 818*90d2215bSHong Zhang } else { 819*90d2215bSHong Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 820*90d2215bSHong Zhang } 821cca9ff8bSHong Zhang } else { 822cca9ff8bSHong Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth; 823*90d2215bSHong Zhang if (avgslicesize <= 432) { 824*90d2215bSHong Zhang if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 825cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 826cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 827*90d2215bSHong Zhang } else { 828cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 829cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 830*90d2215bSHong Zhang } 831cca9ff8bSHong Zhang } else if (avgslicesize <= 2400) { 832cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 833cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 834cca9ff8bSHong Zhang } else { 835cca9ff8bSHong Zhang nblocks = 1 + (nrows - 1) / sliceheight; 836cca9ff8bSHong Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 837cca9ff8bSHong Zhang } 838cca9ff8bSHong Zhang } 83907e43b41SHong Zhang break; 8402d1451d4SHong Zhang } 8412d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 8422d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 8432d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(yy, &y)); 8442d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 8452d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 8462d1451d4SHong Zhang } else { 8472d1451d4SHong Zhang PetscCall(VecCopy(yy, zz)); 8482d1451d4SHong Zhang } 8492d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 8502d1451d4SHong Zhang } 8512d1451d4SHong Zhang 8522d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 8532d1451d4SHong Zhang { 85407e43b41SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 8554e58db63SHong Zhang PetscInt kernel, blocky; 85607e43b41SHong Zhang PetscBool flg; 85707e43b41SHong Zhang 8582d1451d4SHong Zhang PetscFunctionBegin; 8592d1451d4SHong Zhang PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 860*90d2215bSHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 861*90d2215bSHong Zhang if (flg) { 862*90d2215bSHong Zhang PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky); 863*90d2215bSHong Zhang cudastruct->blocky = blocky; 864*90d2215bSHong Zhang } 86507e43b41SHong Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 86607e43b41SHong Zhang if (flg) { 86707e43b41SHong Zhang PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 86807e43b41SHong Zhang cudastruct->kernelchoice = kernel; 869*90d2215bSHong Zhang if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); } 8704e58db63SHong Zhang } 8712d1451d4SHong Zhang PetscOptionsHeadEnd(); 8722d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 8732d1451d4SHong Zhang } 8742d1451d4SHong Zhang 87507e43b41SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 87607e43b41SHong Zhang { 87707e43b41SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 87807e43b41SHong Zhang 879*90d2215bSHong Zhang PetscFunctionBegin; 88007e43b41SHong Zhang PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 88107e43b41SHong Zhang PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 88207e43b41SHong Zhang PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 88307e43b41SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 88407e43b41SHong Zhang } 88507e43b41SHong Zhang 8862d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 8872d1451d4SHong Zhang { 8882d1451d4SHong Zhang PetscFunctionBegin; 8892d1451d4SHong Zhang PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 89007e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 8912d1451d4SHong Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 8922d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 8932d1451d4SHong Zhang A->ops->mult = MatMult_SeqSELLCUDA; 8942d1451d4SHong Zhang A->ops->multadd = MatMultAdd_SeqSELLCUDA; 8952d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 8962d1451d4SHong Zhang } 8972d1451d4SHong Zhang 8982d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 8992d1451d4SHong Zhang { 9002d1451d4SHong Zhang PetscFunctionBegin; 9012d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { 9022d1451d4SHong Zhang if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 9032d1451d4SHong Zhang } 9042d1451d4SHong Zhang PetscCall(MatDestroy_SeqSELL(A)); 9052d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9062d1451d4SHong Zhang } 9072d1451d4SHong Zhang 9082d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 9092d1451d4SHong Zhang static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 9102d1451d4SHong Zhang { 9112d1451d4SHong Zhang PetscFunctionBegin; 9122d1451d4SHong Zhang PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 9132d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 9142d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9152d1451d4SHong Zhang } 9162d1451d4SHong Zhang 9172d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 9182d1451d4SHong Zhang { 9192d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct; 9202d1451d4SHong Zhang 9212d1451d4SHong Zhang PetscFunctionBegin; 9222d1451d4SHong Zhang PetscCall(PetscFree(B->defaultvectype)); 9232d1451d4SHong Zhang PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 9242d1451d4SHong Zhang 9252d1451d4SHong Zhang if (!B->spptr) { 9262d1451d4SHong Zhang if (B->factortype == MAT_FACTOR_NONE) { 9272d1451d4SHong Zhang PetscCall(PetscNew(&cudastruct)); 9282d1451d4SHong Zhang B->spptr = cudastruct; 9292d1451d4SHong Zhang } 9302d1451d4SHong Zhang } 9312d1451d4SHong Zhang 9322d1451d4SHong Zhang B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 9332d1451d4SHong Zhang B->ops->destroy = MatDestroy_SeqSELLCUDA; 9342d1451d4SHong Zhang B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 9352d1451d4SHong Zhang B->ops->mult = MatMult_SeqSELLCUDA; 9362d1451d4SHong Zhang B->ops->multadd = MatMultAdd_SeqSELLCUDA; 9372d1451d4SHong Zhang B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 9382d1451d4SHong Zhang 93907e43b41SHong Zhang /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 94007e43b41SHong Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 94107e43b41SHong Zhang 9422d1451d4SHong Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 9432d1451d4SHong Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 9442d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9452d1451d4SHong Zhang } 9462d1451d4SHong Zhang 9472d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 9482d1451d4SHong Zhang { 9492d1451d4SHong Zhang PetscFunctionBegin; 9502d1451d4SHong Zhang PetscCall(MatCreate_SeqSELL(B)); 9512d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 9522d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 9532d1451d4SHong Zhang } 954