1 #include <cuda_runtime.h> 2 3 #include <petscdevice_cuda.h> 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 6 #define SLICE_HEIGHT 16 7 8 typedef struct { 9 PetscInt *colidx; /* column index */ 10 MatScalar *val; 11 PetscInt *sliidx; 12 PetscInt nonzerostate; 13 PetscInt kernelchoice; 14 PetscInt blocky; 15 } Mat_SeqSELLCUDA; 16 17 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 18 { 19 PetscFunctionBegin; 20 if (*cudastruct) { 21 if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 22 if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 23 if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 24 PetscCall(PetscFree(*cudastruct)); 25 } 26 PetscFunctionReturn(PETSC_SUCCESS); 27 } 28 29 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 30 { 31 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 32 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 33 34 PetscFunctionBegin; 35 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 36 PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 37 if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 38 /* copy values only */ 39 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 40 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 41 } else { 42 if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 43 if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 44 if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 45 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 46 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 47 /* copy values, nz or maxallocmat? */ 48 PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 49 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 50 51 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 52 PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 53 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt))); 54 cudastruct->nonzerostate = A->nonzerostate; 55 } 56 PetscCallCUDA(WaitForCUDA()); 57 PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 58 A->offloadmask = PETSC_OFFLOAD_BOTH; 59 } 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 64 { 65 PetscInt i, row, slice_id, row_in_slice; 66 MatScalar sum; 67 /* one thread per row. */ 68 row = blockIdx.x * blockDim.x + threadIdx.x; 69 if (row < nrows) { 70 slice_id = row / sliceheight; 71 row_in_slice = row % sliceheight; 72 sum = 0.0; 73 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 74 y[row] = sum; 75 } 76 } 77 78 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 79 { 80 PetscInt i, row, slice_id, row_in_slice; 81 MatScalar sum; 82 /* one thread per row. */ 83 row = blockIdx.x * blockDim.x + threadIdx.x; 84 if (row < nrows) { 85 slice_id = row / sliceheight; 86 row_in_slice = row % sliceheight; 87 sum = 0.0; 88 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 89 z[row] = y[row] + sum; 90 } 91 } 92 93 /* use 1 block per slice, suitable for large slice width */ 94 template <int BLOCKY> 95 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 96 { 97 __shared__ MatScalar shared[32][BLOCKY]; 98 PetscInt i, row, slice_id = blockIdx.x; 99 int tid = threadIdx.x + threadIdx.y * 32; 100 /* transposed index */ 101 int tidx = tid % BLOCKY; 102 int tidy = tid / BLOCKY; 103 PetscScalar t = 0.0; 104 105 row = slice_id * sliceheight + threadIdx.x % sliceheight; 106 if (row < nrows) { 107 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 108 } 109 #pragma unroll 110 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 111 /* transpose layout to reduce each row using warp shfl */ 112 shared[threadIdx.x][threadIdx.y] = t; 113 __syncthreads(); 114 t = shared[tidy][tidx]; 115 #pragma unroll 116 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 117 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 118 __syncthreads(); 119 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; } 120 } 121 122 /* use 1 warp per slice, suitable for small slice width */ 123 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 124 { 125 PetscInt i, row, slice_id; 126 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 127 row = slice_id * sliceheight + threadIdx.x % sliceheight; 128 double t = 0.0; 129 if (row < nrows) { 130 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 131 } 132 #pragma unroll 133 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 134 if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; } 135 } 136 137 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 138 { 139 __shared__ MatScalar shared[512]; 140 PetscInt i, row, slice_id, row_in_slice; 141 /* multiple threads per row. */ 142 row = blockIdx.x * blockDim.x + threadIdx.x; 143 if (row < nrows) { 144 slice_id = row / SLICE_HEIGHT; 145 row_in_slice = row % SLICE_HEIGHT; 146 147 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 148 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 149 __syncthreads(); 150 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 151 __syncthreads(); 152 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 153 __syncthreads(); 154 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 155 __syncthreads(); 156 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 157 __syncthreads(); 158 if (threadIdx.y < 1) { 159 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 160 y[row] = shared[threadIdx.x]; 161 } 162 } 163 } 164 165 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 166 { 167 __shared__ MatScalar shared[512]; 168 PetscInt i, row, slice_id, row_in_slice; 169 /* multiple threads per row. */ 170 row = blockIdx.x * blockDim.x + threadIdx.x; 171 if (row < nrows) { 172 slice_id = row / SLICE_HEIGHT; 173 row_in_slice = row % SLICE_HEIGHT; 174 175 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 176 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 177 __syncthreads(); 178 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 179 __syncthreads(); 180 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 181 __syncthreads(); 182 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 183 __syncthreads(); 184 if (threadIdx.y < 1) { 185 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 186 y[row] = shared[threadIdx.x]; 187 } 188 } 189 } 190 191 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 192 { 193 __shared__ MatScalar shared[512]; 194 PetscInt i, row, slice_id, row_in_slice; 195 /* multiple threads per row. */ 196 row = blockIdx.x * blockDim.x + threadIdx.x; 197 if (row < nrows) { 198 slice_id = row / SLICE_HEIGHT; 199 row_in_slice = row % SLICE_HEIGHT; 200 201 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 202 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 203 __syncthreads(); 204 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 205 __syncthreads(); 206 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 207 __syncthreads(); 208 if (threadIdx.y < 1) { 209 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 210 y[row] = shared[threadIdx.x]; 211 } 212 } 213 } 214 215 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 216 { 217 __shared__ MatScalar shared[512]; 218 PetscInt i, row, slice_id, row_in_slice; 219 /* multiple threads per row. */ 220 row = blockIdx.x * blockDim.x + threadIdx.x; 221 if (row < nrows) { 222 slice_id = row / SLICE_HEIGHT; 223 row_in_slice = row % SLICE_HEIGHT; 224 225 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 226 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 227 __syncthreads(); 228 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 229 __syncthreads(); 230 if (threadIdx.y < 1) { 231 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 232 y[row] = shared[threadIdx.x]; 233 } 234 } 235 } 236 237 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 238 { 239 __shared__ MatScalar shared[512]; 240 PetscInt i, row, slice_id, row_in_slice; 241 /* multiple threads per row. */ 242 row = blockIdx.x * blockDim.x + threadIdx.x; 243 if (row < nrows) { 244 slice_id = row / SLICE_HEIGHT; 245 row_in_slice = row % SLICE_HEIGHT; 246 247 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 248 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 249 __syncthreads(); 250 if (threadIdx.y < 1) { 251 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 252 y[row] = shared[threadIdx.x]; 253 } 254 } 255 } 256 257 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 258 { 259 __shared__ MatScalar shared[512]; 260 PetscInt i, row, slice_id, row_in_slice; 261 /* multiple threads per row. */ 262 row = blockIdx.x * blockDim.x + threadIdx.x; 263 if (row < nrows) { 264 slice_id = row / SLICE_HEIGHT; 265 row_in_slice = row % SLICE_HEIGHT; 266 267 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 268 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 269 __syncthreads(); 270 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 271 __syncthreads(); 272 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 273 __syncthreads(); 274 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 275 __syncthreads(); 276 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 277 __syncthreads(); 278 if (threadIdx.y < 1) { 279 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 280 z[row] = y[row] + shared[threadIdx.x]; 281 } 282 } 283 } 284 285 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 286 { 287 __shared__ MatScalar shared[512]; 288 PetscInt i, row, slice_id, row_in_slice; 289 /* multiple threads per row. */ 290 row = blockIdx.x * blockDim.x + threadIdx.x; 291 if (row < nrows) { 292 slice_id = row / SLICE_HEIGHT; 293 row_in_slice = row % SLICE_HEIGHT; 294 295 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 296 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 297 __syncthreads(); 298 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 299 __syncthreads(); 300 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 301 __syncthreads(); 302 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 303 __syncthreads(); 304 if (threadIdx.y < 1) { 305 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 306 z[row] = y[row] + shared[threadIdx.x]; 307 } 308 } 309 } 310 311 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 312 { 313 __shared__ MatScalar shared[512]; 314 PetscInt i, row, slice_id, row_in_slice; 315 /* multiple threads per row. */ 316 row = blockIdx.x * blockDim.x + threadIdx.x; 317 if (row < nrows) { 318 slice_id = row / SLICE_HEIGHT; 319 row_in_slice = row % SLICE_HEIGHT; 320 321 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 322 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 323 __syncthreads(); 324 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 325 __syncthreads(); 326 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 327 __syncthreads(); 328 if (threadIdx.y < 1) { 329 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 330 z[row] = y[row] + shared[threadIdx.x]; 331 } 332 } 333 } 334 335 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 336 { 337 __shared__ MatScalar shared[512]; 338 PetscInt i, row, slice_id, row_in_slice; 339 /* multiple threads per row. */ 340 row = blockIdx.x * blockDim.x + threadIdx.x; 341 if (row < nrows) { 342 slice_id = row / SLICE_HEIGHT; 343 row_in_slice = row % SLICE_HEIGHT; 344 345 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 346 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 347 __syncthreads(); 348 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 349 __syncthreads(); 350 if (threadIdx.y < 1) { 351 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 352 z[row] = y[row] + shared[threadIdx.x]; 353 } 354 } 355 } 356 357 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 358 { 359 __shared__ MatScalar shared[512]; 360 PetscInt i, row, slice_id, row_in_slice; 361 /* multiple threads per row. */ 362 row = blockIdx.x * blockDim.x + threadIdx.x; 363 if (row < nrows) { 364 slice_id = row / SLICE_HEIGHT; 365 row_in_slice = row % SLICE_HEIGHT; 366 367 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 368 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 369 __syncthreads(); 370 if (threadIdx.y < 1) { 371 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 372 z[row] = y[row] + shared[threadIdx.x]; 373 } 374 } 375 } 376 377 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 378 { 379 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 380 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 381 PetscScalar *y; 382 const PetscScalar *x; 383 PetscInt totalslices = a->totalslices, nrows = A->rmap->n, sliceheight = a->sliceheight; 384 MatScalar *aval; 385 PetscInt *acolidx; 386 PetscInt *sliidx; 387 PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 388 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 389 390 PetscFunctionBegin; 391 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 392 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 393 /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 394 aval = cudastruct->val; 395 acolidx = cudastruct->colidx; 396 sliidx = cudastruct->sliidx; 397 398 PetscCall(VecCUDAGetArrayRead(xx, &x)); 399 PetscCall(VecCUDAGetArrayWrite(yy, &y)); 400 PetscCall(PetscLogGpuTimeBegin()); 401 402 switch (cudastruct->kernelchoice) { 403 case 9: 404 nblocks = 1 + (nrows - 1) / sliceheight; 405 if (cudastruct->blocky == 2) { 406 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 407 } else if (cudastruct->blocky == 4) { 408 matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 409 } else if (cudastruct->blocky == 8) { 410 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 411 } else if (cudastruct->blocky == 16) { 412 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 413 } else if (cudastruct->blocky == 32) { 414 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 415 } 416 break; 417 case 7: 418 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 419 if (cudastruct->blocky == 2) { 420 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 421 } else if (cudastruct->blocky == 4) { 422 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 423 } else if (cudastruct->blocky == 8) { 424 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 425 } else if (cudastruct->blocky == 16) { 426 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 427 } else if (cudastruct->blocky == 32) { 428 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 429 } else { 430 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 431 } 432 break; 433 case 6: 434 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 435 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 436 break; 437 case 5: 438 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 439 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 440 break; 441 case 4: 442 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 443 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 444 break; 445 case 3: 446 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 447 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 448 break; 449 case 2: /* 16 slices per block if blocksize=512 */ 450 nblocks = 1 + (nrows - 1) / (blocksize / 2); 451 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 452 break; 453 case 1: /* 32 slices per block if blocksize=512 */ 454 nblocks = 1 + (nrows - 1) / blocksize; 455 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 456 break; 457 case 0: 458 if (sliceheight * a->maxslicewidth > 20800) { /* important threshold */ 459 nblocks = 1 + (nrows - 1) / sliceheight; 460 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 461 } else { 462 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 463 if (avgslicesize <= 96) { 464 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 465 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 466 } else if (avgslicesize <= 432) { 467 nblocks = 1 + (nrows - 1) / sliceheight; 468 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 469 } else if (avgslicesize <= 2400) { 470 nblocks = 1 + (nrows - 1) / sliceheight; 471 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 472 } else { 473 nblocks = 1 + (nrows - 1) / sliceheight; 474 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 475 } 476 } 477 break; 478 } 479 PetscCall(PetscLogGpuTimeEnd()); 480 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 481 PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 482 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 483 PetscFunctionReturn(PETSC_SUCCESS); 484 } 485 486 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 487 { 488 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 489 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 490 PetscScalar *z; 491 const PetscScalar *y, *x; 492 PetscInt totalslices = a->totalslices, nrows = A->rmap->n, sliceheight = a->sliceheight; 493 MatScalar *aval = cudastruct->val; 494 PetscInt *acolidx = cudastruct->colidx; 495 PetscInt *sliidx = cudastruct->sliidx; 496 497 PetscFunctionBegin; 498 PetscCheck(sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 499 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 500 if (a->nz) { 501 PetscInt nblocks, blocksize = 512; 502 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 503 PetscCall(VecCUDAGetArrayRead(xx, &x)); 504 PetscCall(VecCUDAGetArrayRead(yy, &y)); 505 PetscCall(VecCUDAGetArrayWrite(zz, &z)); 506 PetscCall(PetscLogGpuTimeBegin()); 507 508 switch (cudastruct->kernelchoice) { 509 case 6: 510 nblocks = 1 + (nrows - 1) / (blocksize / 32); 511 matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 512 break; 513 case 5: 514 nblocks = 1 + (nrows - 1) / (blocksize / 16); 515 matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 516 break; 517 case 4: 518 nblocks = 1 + (nrows - 1) / (blocksize / 8); 519 matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 520 break; 521 case 3: 522 nblocks = 1 + (nrows - 1) / (blocksize / 4); 523 matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 524 break; 525 case 2: 526 nblocks = 1 + (nrows - 1) / (blocksize / 2); 527 matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 528 break; 529 case 1: 530 nblocks = 1 + (nrows - 1) / blocksize; 531 matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 532 break; 533 case 0: /* TODO */ 534 break; 535 } 536 PetscCall(PetscLogGpuTimeEnd()); 537 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 538 PetscCall(VecCUDARestoreArrayRead(yy, &y)); 539 PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 540 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 541 } else { 542 PetscCall(VecCopy(yy, zz)); 543 } 544 PetscFunctionReturn(PETSC_SUCCESS); 545 } 546 547 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 548 { 549 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 550 PetscInt kernel, blocky; 551 PetscBool flg; 552 553 PetscFunctionBegin; 554 PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 555 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 556 if (flg) { 557 PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 558 cudastruct->kernelchoice = kernel; 559 } 560 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 561 if (flg) { 562 PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", kernel); 563 cudastruct->blocky = blocky; 564 } 565 PetscOptionsHeadEnd(); 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 569 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 570 { 571 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 572 573 PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 574 PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 575 PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 576 PetscFunctionReturn(PETSC_SUCCESS); 577 } 578 579 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 580 { 581 PetscFunctionBegin; 582 PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 583 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 584 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 585 if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 586 A->ops->mult = MatMult_SeqSELLCUDA; 587 A->ops->multadd = MatMultAdd_SeqSELLCUDA; 588 PetscFunctionReturn(PETSC_SUCCESS); 589 } 590 591 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 592 { 593 PetscFunctionBegin; 594 if (A->factortype == MAT_FACTOR_NONE) { 595 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 596 } 597 PetscCall(MatDestroy_SeqSELL(A)); 598 PetscFunctionReturn(PETSC_SUCCESS); 599 } 600 601 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 602 static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 603 { 604 PetscFunctionBegin; 605 PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 606 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 607 PetscFunctionReturn(PETSC_SUCCESS); 608 } 609 610 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 611 { 612 Mat_SeqSELLCUDA *cudastruct; 613 614 PetscFunctionBegin; 615 PetscCall(PetscFree(B->defaultvectype)); 616 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 617 618 if (!B->spptr) { 619 if (B->factortype == MAT_FACTOR_NONE) { 620 PetscCall(PetscNew(&cudastruct)); 621 B->spptr = cudastruct; 622 } 623 } 624 625 B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 626 B->ops->destroy = MatDestroy_SeqSELLCUDA; 627 B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 628 B->ops->mult = MatMult_SeqSELLCUDA; 629 B->ops->multadd = MatMultAdd_SeqSELLCUDA; 630 B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 631 632 /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 633 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 634 635 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 636 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 637 PetscFunctionReturn(PETSC_SUCCESS); 638 } 639 640 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 641 { 642 PetscFunctionBegin; 643 PetscCall(MatCreate_SeqSELL(B)); 644 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 645 PetscFunctionReturn(PETSC_SUCCESS); 646 } 647