#include <cuda_runtime.h>

#include <petscdevice_cuda.h>
#include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/

#define SLICE_HEIGHT 16

typedef struct {
  PetscInt   maxallocmat;
  PetscInt   totalentries;
  PetscInt  *colidx; /* column index array, device pointer */
  MatScalar *val;    /* value array, device pointer */
  PetscInt   totalslices;
  PetscInt  *sliidx; /* slice index array, device pointer */
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
  PetscInt   blocky;
  PetscInt   chunksperblock;
  PetscInt   totalchunks;
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLCUDA;

static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
    if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
    if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
    if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
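/*
  Data-layout note (a sketch of the sliced ELLPACK storage this file assumes,
  matching how the kernels below index val/colidx): rows are grouped into
  slices of `sliceheight` consecutive rows, each slice is padded to the width
  of its longest row, and entries are stored column-major within a slice, so
  entry j of row r lives at val[sliidx[s] + j * sliceheight + r % sliceheight]
  for slice s = r / sliceheight. For example, with sliceheight = 2 and two
  slices of widths 3 and 1, writing (r,j) for the j-th stored entry of row r:

    sliidx = {0, 6, 8}
    val    = {(0,0) (1,0) (0,1) (1,1) (0,2) (1,2) | (2,0) (3,0)}

  Rows shorter than their slice width are padded out to it.
*/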
static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
      if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
      if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
      if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
      cudastruct->maxallocmat  = a->maxallocmat;
      cudastruct->totalentries = a->sliidx[a->totalslices];
      cudastruct->totalslices  = a->totalslices;
      cudastruct->totalchunks  = a->totalchunks;
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
      /* copy values, nz or maxallocmat? */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->chunk_slice_map), a->totalchunks * sizeof(PetscInt)));
      PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

__global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}
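/*
  Memory-access note: because a slice is stored column-major, the
  stride-`sliceheight` loop above makes threads that own adjacent rows of a
  slice read adjacent aval/acolidx entries in every iteration, so the global
  loads coalesce. For instance, with sliceheight = 16, the threads covering
  rows 0..15 read aval[sliidx[0] + 0..15] in the first iteration and
  aval[sliidx[0] + 16..31] in the second.
*/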
__global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * 32;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
}
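/*
  Reduction pattern used by both kernel9 variants above: each 32 x BLOCKY
  block owns one slice and accumulates partial sums t in registers, then
  (1) __shfl_down_sync folds each warp's 32 partials down to `sliceheight`
      per-row partials,
  (2) the partials are staged through shared memory in transposed order so
      the BLOCKY partials of one row become contiguous within a warp, and
  (3) a second shuffle reduction of width BLOCKY yields one sum per row.
  No atomics are needed because exactly one block owns each slice.
*/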
template <int BLOCKY>
__device__ __forceinline__ bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
#pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16;
    shared[threadIdx.x + threadIdx.y * 32] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * 32] = *val;
      if (i == 1) head = false;
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
    __syncthreads();
  }
  return head;
}

/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomicAdd(&y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
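/*
  Work distribution in the kernel8 variants: the entry range of the matrix is
  cut into fixed-size chunks of 32*BLOCKY entries, one chunk per block
  iteration, and chunk_slice_map[cid] records the slice in which chunk cid
  starts. A chunk that stays inside one slice is accumulated in registers and
  reduced with the same shuffle/transpose pattern as kernel9 once the slice
  ends (the result still goes through atomicAdd, since other chunks of the
  slice may live in other blocks). A chunk that straddles slice boundaries
  tags each half-warp with its slice id and lets segment_scan() merge
  neighboring partial sums belonging to the same slice, after which the
  segment heads atomicAdd into the output. For example, with 32*BLOCKY = 64
  and sliidx = {0, 50, 128, ...}, chunk 0 (entries 0..63) spans slices 0 and
  1 and takes the segmented path, while chunk 1 (entries 64..127) stays
  inside slice 1.
*/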
/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * 32];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
    gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
      if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomicAdd(&z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
#pragma unroll
          for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
#pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}
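/*
  Initialization note for the kernel8 variants: because several blocks may
  contribute to the same row through atomicAdd, the output cannot simply be
  overwritten row by row. Each kernel therefore first seeds the output with a
  grid-strided loop (zeroing y, or copying y into z for the multiply-add
  variant) before any partial sums are accumulated.
*/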
/* use 1 warp per slice, suitable for small slice width */
__global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
}

/* use 1 warp per slice, suitable for small slice width */
__global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
#pragma unroll
  for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
}

/*********** Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison **********/
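/*
  All of the legacy kernels below share one pattern: a block of
  blockDim.x x blockDim.y threads assigns blockDim.y threads to each row,
  each accumulating a strided partial sum into shared memory, followed by a
  log2(blockDim.y)-step tree reduction over threadIdx.y. Kernel k is launched
  with blockDim.y = 2^(k-1): kernel6 uses 16x32 blocks and five reduction
  steps, kernel2 uses 256x2 blocks and a single step.
*/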
__global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
__global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
__global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
__global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
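/*
  Host-side dispatch for y = A*x. cudastruct->kernelchoice (settable with
  -mat_sell_spmv_cuda_kernel, see MatSetFromOptions_SeqSELLCUDA() below)
  selects among the kernels above:
    1    one thread per row (basic kernel)
    2-6  shared-memory tree reductions, hard-wired to slice height 16
    7    one warp per slice
    8    chunk-based load balancing (dispatched directly only in MatMultAdd;
         MatMult reaches it through the heuristic of case 0)
    9    one block per slice
    0    (default) choose from slice-width statistics: a large max/average
         slice-width ratio selects the load-balanced kernel, otherwise the
         average slice size picks kernel 7 or a blocky variant of kernel 9.
  The numeric thresholds (12.0, 432, 2400, 2048, 100000) are empirical
  tuning values.
*/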
PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  PetscInt           chunksperblock, nchunks, *chunk_slice_map;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
  PetscReal          maxoveravg;

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays inside cudastruct may be (re)allocated by MatSeqSELLCUDACopyToGPU(), so read them only after that call */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (cudastruct->kernelchoice) {
  case 9:
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7:
    nblocks = 1 + (nrows - 1) / (2 * sliceheight);
    if (cudastruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (cudastruct->blocky == 32) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 0:
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = cudastruct->totalchunks;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 32) {
        matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  PetscInt           chunksperblock, nchunks, *chunk_slice_map;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscReal          maxoveravg;

  PetscFunctionBegin;
  PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays inside cudastruct may be (re)allocated by MatSeqSELLCUDACopyToGPU(), so read them only after that call */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt blocky = cudastruct->blocky, nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
    case 9:
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks         = cudastruct->totalchunks;
      blocky          = a->chunksize / 32;
      chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = cudastruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7:
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 32) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 0:
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* each block handles approximately one slice */
        nchunks         = cudastruct->totalchunks;
        blocky          = a->chunksize / 32;
        chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = cudastruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 32) {
          matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel, blocky;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
    cudastruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
    if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
  A->ops->mult    = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
  }
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
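/*
  Minimal usage sketch (not part of this file's implementation; creation of
  the vectors x and y and error handling are elided). The runtime options
  registered above can steer the kernel selection, e.g.
      ./app -mat_sell_spmv_cuda_kernel 7 -mat_sell_spmv_cuda_blocky 4

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, m, n, m, n));
    PetscCall(MatSetType(A, MATSEQSELLCUDA));
    PetscCall(MatSetFromOptions(A));
    // ... insert values with MatSetValues() ...
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatMult(A, x, y)); // runs MatMult_SeqSELLCUDA on the device
    PetscCall(MatDestroy(&A));
*/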