1 #include <cuda_runtime.h> 2 3 #include <petscdevice_cuda.h> 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 6 #define SLICE_HEIGHT 16 7 8 typedef struct { 9 PetscInt maxallocmat; 10 PetscInt totalentries; 11 PetscInt *colidx; /* column index array, device pointer */ 12 MatScalar *val; /* value array, device pointer */ 13 PetscInt totalslices; 14 PetscInt *sliidx; /* slice index array, device pointer */ 15 PetscInt nonzerostate; 16 PetscInt kernelchoice; 17 PetscInt blocky; 18 PetscInt chunksperblock; 19 PetscInt totalchunks; 20 PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */ 21 } Mat_SeqSELLCUDA; 22 23 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 24 { 25 PetscFunctionBegin; 26 if (*cudastruct) { 27 if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 28 if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 29 if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 30 if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); } 31 PetscCall(PetscFree(*cudastruct)); 32 } 33 PetscFunctionReturn(PETSC_SUCCESS); 34 } 35 36 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 37 { 38 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 39 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 40 41 PetscFunctionBegin; 42 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 43 PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 44 if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 45 /* copy values only */ 46 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 47 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 48 } else { 49 if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 50 if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 51 if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 52 if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); } 53 cudastruct->maxallocmat = a->maxallocmat; 54 cudastruct->totalentries = a->sliidx[a->totalslices]; 55 cudastruct->totalslices = a->totalslices; 56 cudastruct->totalchunks = a->totalchunks; 57 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 58 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 59 /* copy values, nz or maxallocmat? */ 60 PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 61 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 62 63 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 64 PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 65 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->chunk_slice_map), a->totalchunks * sizeof(PetscInt))); 66 PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(PetscInt), cudaMemcpyHostToDevice)); 67 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt))); 68 } 69 PetscCallCUDA(WaitForCUDA()); 70 PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 71 A->offloadmask = PETSC_OFFLOAD_BOTH; 72 } 73 PetscFunctionReturn(PETSC_SUCCESS); 74 } 75 76 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 77 { 78 PetscInt i, row, slice_id, row_in_slice; 79 MatScalar sum; 80 /* one thread per row. */ 81 row = blockIdx.x * blockDim.x + threadIdx.x; 82 if (row < nrows) { 83 slice_id = row / sliceheight; 84 row_in_slice = row % sliceheight; 85 sum = 0.0; 86 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 87 y[row] = sum; 88 } 89 } 90 91 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 92 { 93 PetscInt i, row, slice_id, row_in_slice; 94 MatScalar sum; 95 /* one thread per row. */ 96 row = blockIdx.x * blockDim.x + threadIdx.x; 97 if (row < nrows) { 98 slice_id = row / sliceheight; 99 row_in_slice = row % sliceheight; 100 sum = 0.0; 101 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 102 z[row] = y[row] + sum; 103 } 104 } 105 106 /* use 1 block per slice, suitable for large slice width */ 107 template <int BLOCKY> 108 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 109 { 110 __shared__ MatScalar shared[32][BLOCKY]; 111 PetscInt i, row, slice_id = blockIdx.x; 112 int tid = threadIdx.x + threadIdx.y * 32; 113 /* transposed index */ 114 int tidx = tid % BLOCKY; 115 int tidy = tid / BLOCKY; 116 PetscScalar t = 0.0; 117 118 row = slice_id * sliceheight + threadIdx.x % sliceheight; 119 if (row < nrows) { 120 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 121 } 122 #pragma unroll 123 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 124 /* transpose layout to reduce each row using warp shfl */ 125 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 126 __syncthreads(); 127 if (tidy < sliceheight) t = shared[tidy][tidx]; 128 #pragma unroll 129 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 130 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 131 __syncthreads(); 132 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; } 133 } 134 135 /* use 1 block per slice, suitable for large slice width */ 136 template <int BLOCKY> 137 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 138 { 139 __shared__ MatScalar shared[32][BLOCKY]; 140 PetscInt i, row, slice_id = blockIdx.x; 141 int tid = threadIdx.x + threadIdx.y * 32; 142 /* transposed index */ 143 int tidx = tid % BLOCKY; 144 int tidy = tid / BLOCKY; 145 PetscScalar t = 0.0; 146 147 row = slice_id * sliceheight + threadIdx.x % sliceheight; 148 if (row < nrows) { 149 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 150 } 151 #pragma unroll 152 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 153 /* transpose layout to reduce each row using warp shfl */ 154 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 155 __syncthreads(); 156 if (tidy < sliceheight) t = shared[tidy][tidx]; 157 #pragma unroll 158 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 159 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 160 __syncthreads(); 161 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; } 162 } 163 164 template <int BLOCKY> 165 __device__ __forceinline__ bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val) 166 { 167 bool head = true; 168 #pragma unroll 169 for (int i = 1; i < BLOCKY * 2; i <<= 1) { 170 int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16; 171 shared[threadIdx.x + threadIdx.y * 32] = 0; 172 if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) { 173 shared[threadIdx.x + threadIdx.y * 32] = *val; 174 if (i == 1) head = false; 175 } 176 __syncthreads(); 177 if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16]; 178 __syncthreads(); 179 } 180 return head; 181 } 182 183 /* load-balancing version. Chunksize is equal to the number of threads per block */ 184 template <int BLOCKY> 185 __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 186 { 187 __shared__ MatScalar shared[BLOCKY * 32]; 188 PetscInt gid, row, start_slice, cid; 189 PetscScalar t = 0.0; 190 /* zero out y */ 191 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 192 gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 193 if (gid < nrows) y[gid] = 0.0; 194 } 195 for (int iter = 0; iter < chunksperblock; iter++) { 196 cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 197 if (cid < totalchunks) { 198 start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 199 gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 200 if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 201 __shared__ PetscInt flag[BLOCKY * 2]; 202 bool write; 203 PetscInt slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices]; 204 /* find out the slice that this element belongs to */ 205 while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 206 if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 207 row = slice_id * sliceheight + threadIdx.x % sliceheight; 208 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 209 __syncthreads(); 210 write = segment_scan<BLOCKY>(flag, shared, &t); 211 if (row < nrows && gid < totalentries && write) atomicAdd(&y[row], t); 212 t = 0.0; 213 } else { /* this iteration covers only one slice */ 214 row = start_slice * sliceheight + threadIdx.x % sliceheight; 215 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 216 if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 217 int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 218 /* reduction and write to output vector */ 219 #pragma unroll 220 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 221 /* transpose layout to reduce each row using warp shfl */ 222 if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 223 __syncthreads(); 224 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 225 #pragma unroll 226 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 227 if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ } 228 __syncthreads(); 229 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 230 t = 0.0; 231 } 232 } 233 } 234 } 235 } 236 237 /* load-balancing version. Chunksize is equal to the number of threads per block */ 238 template <int BLOCKY> 239 __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 240 { 241 __shared__ MatScalar shared[BLOCKY * 32]; 242 PetscInt gid, row, start_slice, cid; 243 PetscScalar t = 0.0; 244 /* copy y to z */ 245 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 246 gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 247 if (gid < nrows) z[gid] = y[gid]; 248 } 249 for (int iter = 0; iter < chunksperblock; iter++) { 250 cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 251 if (cid < totalchunks) { 252 start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 253 gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 254 if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 255 __shared__ PetscInt flag[BLOCKY * 2]; 256 bool write; 257 PetscInt slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices]; 258 /* find out the slice that this element belongs to */ 259 while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 260 if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 261 row = slice_id * sliceheight + threadIdx.x % sliceheight; 262 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 263 __syncthreads(); 264 write = segment_scan<BLOCKY>(flag, shared, &t); 265 if (row < nrows && gid < totalentries && write) atomicAdd(&z[row], t); 266 t = 0.0; 267 } else { /* this iteration covers only one slice */ 268 row = start_slice * sliceheight + threadIdx.x % sliceheight; 269 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 270 if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 271 int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 272 /* reduction and write to output vector */ 273 #pragma unroll 274 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 275 /* transpose layout to reduce each row using warp shfl */ 276 if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 277 __syncthreads(); 278 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 279 #pragma unroll 280 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 281 if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ } 282 __syncthreads(); 283 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 284 t = 0.0; 285 } 286 } 287 } 288 } 289 } 290 291 /* use 1 warp per slice, suitable for small slice width */ 292 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 293 { 294 PetscInt i, row, slice_id; 295 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 296 row = slice_id * sliceheight + threadIdx.x % sliceheight; 297 double t = 0.0; 298 if (row < nrows) { 299 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 300 } 301 #pragma unroll 302 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 303 if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; } 304 } 305 306 /* use 1 warp per slice, suitable for small slice width */ 307 __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 308 { 309 PetscInt i, row, slice_id; 310 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 311 row = slice_id * sliceheight + threadIdx.x % sliceheight; 312 double t = 0.0; 313 if (row < nrows) { 314 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 315 } 316 #pragma unroll 317 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 318 if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; } 319 } 320 321 /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/ 322 323 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 324 { 325 __shared__ MatScalar shared[512]; 326 PetscInt i, row, slice_id, row_in_slice; 327 /* multiple threads per row. */ 328 row = blockIdx.x * blockDim.x + threadIdx.x; 329 if (row < nrows) { 330 slice_id = row / SLICE_HEIGHT; 331 row_in_slice = row % SLICE_HEIGHT; 332 333 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 334 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 335 __syncthreads(); 336 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 337 __syncthreads(); 338 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 339 __syncthreads(); 340 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 341 __syncthreads(); 342 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 343 __syncthreads(); 344 if (threadIdx.y < 1) { 345 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 346 y[row] = shared[threadIdx.x]; 347 } 348 } 349 } 350 351 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 352 { 353 __shared__ MatScalar shared[512]; 354 PetscInt i, row, slice_id, row_in_slice; 355 /* multiple threads per row. */ 356 row = blockIdx.x * blockDim.x + threadIdx.x; 357 if (row < nrows) { 358 slice_id = row / SLICE_HEIGHT; 359 row_in_slice = row % SLICE_HEIGHT; 360 361 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 362 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 363 __syncthreads(); 364 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 365 __syncthreads(); 366 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 367 __syncthreads(); 368 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 369 __syncthreads(); 370 if (threadIdx.y < 1) { 371 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 372 y[row] = shared[threadIdx.x]; 373 } 374 } 375 } 376 377 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 378 { 379 __shared__ MatScalar shared[512]; 380 PetscInt i, row, slice_id, row_in_slice; 381 /* multiple threads per row. */ 382 row = blockIdx.x * blockDim.x + threadIdx.x; 383 if (row < nrows) { 384 slice_id = row / SLICE_HEIGHT; 385 row_in_slice = row % SLICE_HEIGHT; 386 387 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 388 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 389 __syncthreads(); 390 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 391 __syncthreads(); 392 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 393 __syncthreads(); 394 if (threadIdx.y < 1) { 395 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 396 y[row] = shared[threadIdx.x]; 397 } 398 } 399 } 400 401 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 402 { 403 __shared__ MatScalar shared[512]; 404 PetscInt i, row, slice_id, row_in_slice; 405 /* multiple threads per row. */ 406 row = blockIdx.x * blockDim.x + threadIdx.x; 407 if (row < nrows) { 408 slice_id = row / SLICE_HEIGHT; 409 row_in_slice = row % SLICE_HEIGHT; 410 411 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 412 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 413 __syncthreads(); 414 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 415 __syncthreads(); 416 if (threadIdx.y < 1) { 417 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 418 y[row] = shared[threadIdx.x]; 419 } 420 } 421 } 422 423 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 424 { 425 __shared__ MatScalar shared[512]; 426 PetscInt i, row, slice_id, row_in_slice; 427 /* multiple threads per row. */ 428 row = blockIdx.x * blockDim.x + threadIdx.x; 429 if (row < nrows) { 430 slice_id = row / SLICE_HEIGHT; 431 row_in_slice = row % SLICE_HEIGHT; 432 433 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 434 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 435 __syncthreads(); 436 if (threadIdx.y < 1) { 437 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 438 y[row] = shared[threadIdx.x]; 439 } 440 } 441 } 442 443 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 444 { 445 __shared__ MatScalar shared[512]; 446 PetscInt i, row, slice_id, row_in_slice; 447 /* multiple threads per row. */ 448 row = blockIdx.x * blockDim.x + threadIdx.x; 449 if (row < nrows) { 450 slice_id = row / SLICE_HEIGHT; 451 row_in_slice = row % SLICE_HEIGHT; 452 453 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 454 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 455 __syncthreads(); 456 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 457 __syncthreads(); 458 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 459 __syncthreads(); 460 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 461 __syncthreads(); 462 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 463 __syncthreads(); 464 if (threadIdx.y < 1) { 465 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 466 z[row] = y[row] + shared[threadIdx.x]; 467 } 468 } 469 } 470 471 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 472 { 473 __shared__ MatScalar shared[512]; 474 PetscInt i, row, slice_id, row_in_slice; 475 /* multiple threads per row. */ 476 row = blockIdx.x * blockDim.x + threadIdx.x; 477 if (row < nrows) { 478 slice_id = row / SLICE_HEIGHT; 479 row_in_slice = row % SLICE_HEIGHT; 480 481 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 482 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 483 __syncthreads(); 484 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 485 __syncthreads(); 486 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 487 __syncthreads(); 488 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 489 __syncthreads(); 490 if (threadIdx.y < 1) { 491 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 492 z[row] = y[row] + shared[threadIdx.x]; 493 } 494 } 495 } 496 497 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 498 { 499 __shared__ MatScalar shared[512]; 500 PetscInt i, row, slice_id, row_in_slice; 501 /* multiple threads per row. */ 502 row = blockIdx.x * blockDim.x + threadIdx.x; 503 if (row < nrows) { 504 slice_id = row / SLICE_HEIGHT; 505 row_in_slice = row % SLICE_HEIGHT; 506 507 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 508 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 509 __syncthreads(); 510 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 511 __syncthreads(); 512 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 513 __syncthreads(); 514 if (threadIdx.y < 1) { 515 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 516 z[row] = y[row] + shared[threadIdx.x]; 517 } 518 } 519 } 520 521 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 522 { 523 __shared__ MatScalar shared[512]; 524 PetscInt i, row, slice_id, row_in_slice; 525 /* multiple threads per row. */ 526 row = blockIdx.x * blockDim.x + threadIdx.x; 527 if (row < nrows) { 528 slice_id = row / SLICE_HEIGHT; 529 row_in_slice = row % SLICE_HEIGHT; 530 531 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 532 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 533 __syncthreads(); 534 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 535 __syncthreads(); 536 if (threadIdx.y < 1) { 537 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 538 z[row] = y[row] + shared[threadIdx.x]; 539 } 540 } 541 } 542 543 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 544 { 545 __shared__ MatScalar shared[512]; 546 PetscInt i, row, slice_id, row_in_slice; 547 /* multiple threads per row. */ 548 row = blockIdx.x * blockDim.x + threadIdx.x; 549 if (row < nrows) { 550 slice_id = row / SLICE_HEIGHT; 551 row_in_slice = row % SLICE_HEIGHT; 552 553 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 554 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 555 __syncthreads(); 556 if (threadIdx.y < 1) { 557 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 558 z[row] = y[row] + shared[threadIdx.x]; 559 } 560 } 561 } 562 563 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 564 { 565 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 566 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 567 PetscScalar *y; 568 const PetscScalar *x; 569 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 570 PetscInt chunksperblock, nchunks, *chunk_slice_map; 571 MatScalar *aval; 572 PetscInt *acolidx; 573 PetscInt *sliidx; 574 PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 575 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 576 577 PetscFunctionBegin; 578 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 579 PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 580 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 581 /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 582 aval = cudastruct->val; 583 acolidx = cudastruct->colidx; 584 sliidx = cudastruct->sliidx; 585 586 PetscCall(VecCUDAGetArrayRead(xx, &x)); 587 PetscCall(VecCUDAGetArrayWrite(yy, &y)); 588 PetscCall(PetscLogGpuTimeBegin()); 589 590 switch (cudastruct->kernelchoice) { 591 case 9: 592 nblocks = 1 + (nrows - 1) / sliceheight; 593 if (cudastruct->blocky == 2) { 594 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 595 } else if (cudastruct->blocky == 4) { 596 matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 597 } else if (cudastruct->blocky == 8) { 598 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 599 } else if (cudastruct->blocky == 16) { 600 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 601 } else if (cudastruct->blocky == 32) { 602 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 603 } else { 604 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 605 } 606 break; 607 case 7: 608 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 609 if (cudastruct->blocky == 2) { 610 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 611 } else if (cudastruct->blocky == 4) { 612 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 613 } else if (cudastruct->blocky == 8) { 614 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 615 } else if (cudastruct->blocky == 16) { 616 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 617 } else if (cudastruct->blocky == 32) { 618 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 619 } else { 620 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 621 } 622 break; 623 case 6: 624 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 625 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 626 break; 627 case 5: 628 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 629 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 630 break; 631 case 4: 632 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 633 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 634 break; 635 case 3: 636 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 637 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 638 break; 639 case 2: /* 16 slices per block if blocksize=512 */ 640 nblocks = 1 + (nrows - 1) / (blocksize / 2); 641 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 642 break; 643 case 1: /* 32 slices per block if blocksize=512 */ 644 nblocks = 1 + (nrows - 1) / blocksize; 645 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 646 break; 647 case 0: 648 maxoveravg = a->maxslicewidth / a->avgslicewidth; 649 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 650 /* each block handles approximately one slice */ 651 PetscInt blocky = a->chunksize / 32; 652 nchunks = cudastruct->totalchunks; 653 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 654 nblocks = 1 + (nchunks - 1) / chunksperblock; 655 chunk_slice_map = cudastruct->chunk_slice_map; 656 if (blocky == 2) { 657 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 658 } else if (blocky == 4) { 659 matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 660 } else if (blocky == 8) { 661 matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 662 } else if (blocky == 16) { 663 matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 664 } else if (blocky == 32) { 665 matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 666 } else { 667 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 668 } 669 } else { 670 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 671 if (avgslicesize <= 432) { 672 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 673 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 674 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 675 } else { 676 nblocks = 1 + (nrows - 1) / sliceheight; 677 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 678 } 679 } else if (avgslicesize <= 2400) { 680 nblocks = 1 + (nrows - 1) / sliceheight; 681 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 682 } else { 683 nblocks = 1 + (nrows - 1) / sliceheight; 684 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 685 } 686 } 687 break; 688 } 689 PetscCall(PetscLogGpuTimeEnd()); 690 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 691 PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 692 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 693 PetscFunctionReturn(PETSC_SUCCESS); 694 } 695 696 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 697 { 698 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 699 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 700 PetscScalar *z; 701 const PetscScalar *y, *x; 702 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 703 PetscInt chunksperblock, nchunks, *chunk_slice_map; 704 MatScalar *aval = cudastruct->val; 705 PetscInt *acolidx = cudastruct->colidx; 706 PetscInt *sliidx = cudastruct->sliidx; 707 PetscReal maxoveravg; 708 709 PetscFunctionBegin; 710 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 711 PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 712 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 713 if (a->nz) { 714 PetscInt blocky = cudastruct->blocky, nblocks, blocksize = 512; 715 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 716 PetscCall(VecCUDAGetArrayRead(xx, &x)); 717 PetscCall(VecCUDAGetArrayRead(yy, &y)); 718 PetscCall(VecCUDAGetArrayWrite(zz, &z)); 719 PetscCall(PetscLogGpuTimeBegin()); 720 721 switch (cudastruct->kernelchoice) { 722 case 9: 723 nblocks = 1 + (nrows - 1) / sliceheight; 724 if (blocky == 2) { 725 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 726 } else if (blocky == 4) { 727 matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 728 } else if (blocky == 8) { 729 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 730 } else if (blocky == 16) { 731 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 732 } else if (blocky == 32) { 733 matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 734 } else { 735 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 736 } 737 break; 738 case 8: 739 /* each block handles approximately one slice */ 740 nchunks = cudastruct->totalchunks; 741 blocky = a->chunksize / 32; 742 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 743 nblocks = 1 + (nchunks - 1) / chunksperblock; 744 chunk_slice_map = cudastruct->chunk_slice_map; 745 if (blocky == 2) { 746 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 747 } else if (blocky == 4) { 748 matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 749 } else if (blocky == 8) { 750 matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 751 } else if (blocky == 16) { 752 matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 753 } else if (blocky == 32) { 754 matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 755 } else { 756 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 757 } 758 break; 759 case 7: 760 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 761 if (blocky == 2) { 762 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 763 } else if (blocky == 4) { 764 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 765 } else if (blocky == 8) { 766 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 767 } else if (blocky == 16) { 768 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 769 } else if (blocky == 32) { 770 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 771 } else { 772 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 773 } 774 break; 775 case 6: 776 nblocks = 1 + (nrows - 1) / (blocksize / 32); 777 matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); 778 break; 779 case 5: 780 nblocks = 1 + (nrows - 1) / (blocksize / 16); 781 matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); 782 break; 783 case 4: 784 nblocks = 1 + (nrows - 1) / (blocksize / 8); 785 matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); 786 break; 787 case 3: 788 nblocks = 1 + (nrows - 1) / (blocksize / 4); 789 matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); 790 break; 791 case 2: 792 nblocks = 1 + (nrows - 1) / (blocksize / 2); 793 matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); 794 break; 795 case 1: 796 nblocks = 1 + (nrows - 1) / blocksize; 797 matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 798 break; 799 case 0: 800 maxoveravg = a->maxslicewidth / a->avgslicewidth; 801 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 802 /* each block handles approximately one slice */ 803 nchunks = cudastruct->totalchunks; 804 blocky = a->chunksize / 32; 805 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 806 nblocks = 1 + (nchunks - 1) / chunksperblock; 807 chunk_slice_map = cudastruct->chunk_slice_map; 808 if (blocky == 2) { 809 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 810 } else if (blocky == 4) { 811 matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 812 } else if (blocky == 8) { 813 matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 814 } else if (blocky == 16) { 815 matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 816 } else if (blocky == 32) { 817 matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 818 } else { 819 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 820 } 821 } else { 822 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 823 if (avgslicesize <= 432) { 824 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 825 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 826 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 827 } else { 828 nblocks = 1 + (nrows - 1) / sliceheight; 829 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 830 } 831 } else if (avgslicesize <= 2400) { 832 nblocks = 1 + (nrows - 1) / sliceheight; 833 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 834 } else { 835 nblocks = 1 + (nrows - 1) / sliceheight; 836 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 837 } 838 } 839 break; 840 } 841 PetscCall(PetscLogGpuTimeEnd()); 842 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 843 PetscCall(VecCUDARestoreArrayRead(yy, &y)); 844 PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 845 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 846 } else { 847 PetscCall(VecCopy(yy, zz)); 848 } 849 PetscFunctionReturn(PETSC_SUCCESS); 850 } 851 852 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 853 { 854 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 855 PetscInt kernel, blocky; 856 PetscBool flg; 857 858 PetscFunctionBegin; 859 PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 860 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 861 if (flg) { 862 PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky); 863 cudastruct->blocky = blocky; 864 } 865 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 866 if (flg) { 867 PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 868 cudastruct->kernelchoice = kernel; 869 if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); } 870 } 871 PetscOptionsHeadEnd(); 872 PetscFunctionReturn(PETSC_SUCCESS); 873 } 874 875 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 876 { 877 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 878 879 PetscFunctionBegin; 880 PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 881 PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 882 PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 883 PetscFunctionReturn(PETSC_SUCCESS); 884 } 885 886 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 887 { 888 PetscFunctionBegin; 889 PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 890 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 891 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 892 if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 893 A->ops->mult = MatMult_SeqSELLCUDA; 894 A->ops->multadd = MatMultAdd_SeqSELLCUDA; 895 PetscFunctionReturn(PETSC_SUCCESS); 896 } 897 898 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 899 { 900 PetscFunctionBegin; 901 if (A->factortype == MAT_FACTOR_NONE) { 902 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 903 } 904 PetscCall(MatDestroy_SeqSELL(A)); 905 PetscFunctionReturn(PETSC_SUCCESS); 906 } 907 908 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 909 static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 910 { 911 PetscFunctionBegin; 912 PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 913 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 914 PetscFunctionReturn(PETSC_SUCCESS); 915 } 916 917 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 918 { 919 Mat_SeqSELLCUDA *cudastruct; 920 921 PetscFunctionBegin; 922 PetscCall(PetscFree(B->defaultvectype)); 923 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 924 925 if (!B->spptr) { 926 if (B->factortype == MAT_FACTOR_NONE) { 927 PetscCall(PetscNew(&cudastruct)); 928 B->spptr = cudastruct; 929 } 930 } 931 932 B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 933 B->ops->destroy = MatDestroy_SeqSELLCUDA; 934 B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 935 B->ops->mult = MatMult_SeqSELLCUDA; 936 B->ops->multadd = MatMultAdd_SeqSELLCUDA; 937 B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 938 939 /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 940 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 941 942 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 943 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 944 PetscFunctionReturn(PETSC_SUCCESS); 945 } 946 947 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 948 { 949 PetscFunctionBegin; 950 PetscCall(MatCreate_SeqSELL(B)); 951 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 952 PetscFunctionReturn(PETSC_SUCCESS); 953 } 954