1 #include <cuda_runtime.h> 2 3 #include <petscdevice_cuda.h> 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 6 #define SLICE_HEIGHT 16 7 8 typedef struct { 9 PetscInt *colidx; /* column index */ 10 MatScalar *val; 11 PetscInt *sliidx; 12 PetscInt nonzerostate; 13 PetscInt kernelchoice; 14 PetscInt blocky; 15 } Mat_SeqSELLCUDA; 16 17 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 18 { 19 PetscFunctionBegin; 20 if (*cudastruct) { 21 if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 22 if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 23 if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 24 PetscCall(PetscFree(*cudastruct)); 25 } 26 PetscFunctionReturn(PETSC_SUCCESS); 27 } 28 29 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 30 { 31 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 32 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 33 34 PetscFunctionBegin; 35 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 36 PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 37 if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 38 /* copy values only */ 39 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 40 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 41 } else { 42 if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 43 if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 44 if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 45 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 46 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 47 /* copy values, nz or maxallocmat? */ 48 PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 49 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 50 51 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 52 PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 53 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt))); 54 cudastruct->nonzerostate = A->nonzerostate; 55 } 56 PetscCallCUDA(WaitForCUDA()); 57 PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 58 A->offloadmask = PETSC_OFFLOAD_BOTH; 59 } 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 64 { 65 PetscInt i, row, slice_id, row_in_slice; 66 MatScalar sum; 67 /* one thread per row. */ 68 row = blockIdx.x * blockDim.x + threadIdx.x; 69 if (row < nrows) { 70 slice_id = row / sliceheight; 71 row_in_slice = row % sliceheight; 72 sum = 0.0; 73 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 74 y[row] = sum; 75 } 76 } 77 78 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 79 { 80 PetscInt i, row, slice_id, row_in_slice; 81 MatScalar sum; 82 /* one thread per row. */ 83 row = blockIdx.x * blockDim.x + threadIdx.x; 84 if (row < nrows) { 85 slice_id = row / sliceheight; 86 row_in_slice = row % sliceheight; 87 sum = 0.0; 88 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 89 z[row] = y[row] + sum; 90 } 91 } 92 93 /* use 1 block per slice, suitable for large slice width */ 94 template <int BLOCKY> 95 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 96 { 97 __shared__ MatScalar shared[32][BLOCKY]; 98 PetscInt i, row, slice_id = blockIdx.x; 99 int tid = threadIdx.x + threadIdx.y * 32; 100 /* transposed index */ 101 int tidx = tid % BLOCKY; 102 int tidy = tid / BLOCKY; 103 PetscScalar t = 0.0; 104 105 row = slice_id * sliceheight + threadIdx.x % sliceheight; 106 if (row < nrows) { 107 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 108 } 109 #pragma unroll 110 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 111 /* transpose layout to reduce each row using warp shfl */ 112 shared[threadIdx.x][threadIdx.y] = t; 113 __syncthreads(); 114 t = shared[tidy][tidx]; 115 #pragma unroll 116 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 117 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 118 __syncthreads(); 119 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; } 120 } 121 122 /* use 1 warp per slice, suitable for small slice width */ 123 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 124 { 125 PetscInt i, row, slice_id; 126 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 127 row = slice_id * sliceheight + threadIdx.x % sliceheight; 128 double t = 0.0; 129 if (row < nrows) { 130 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 131 } 132 #pragma unroll 133 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 134 if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; } 135 } 136 137 /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/ 138 139 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 140 { 141 __shared__ MatScalar shared[512]; 142 PetscInt i, row, slice_id, row_in_slice; 143 /* multiple threads per row. */ 144 row = blockIdx.x * blockDim.x + threadIdx.x; 145 if (row < nrows) { 146 slice_id = row / SLICE_HEIGHT; 147 row_in_slice = row % SLICE_HEIGHT; 148 149 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 150 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 151 __syncthreads(); 152 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 153 __syncthreads(); 154 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 155 __syncthreads(); 156 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 157 __syncthreads(); 158 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 159 __syncthreads(); 160 if (threadIdx.y < 1) { 161 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 162 y[row] = shared[threadIdx.x]; 163 } 164 } 165 } 166 167 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 168 { 169 __shared__ MatScalar shared[512]; 170 PetscInt i, row, slice_id, row_in_slice; 171 /* multiple threads per row. */ 172 row = blockIdx.x * blockDim.x + threadIdx.x; 173 if (row < nrows) { 174 slice_id = row / SLICE_HEIGHT; 175 row_in_slice = row % SLICE_HEIGHT; 176 177 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 178 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 179 __syncthreads(); 180 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 181 __syncthreads(); 182 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 183 __syncthreads(); 184 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 185 __syncthreads(); 186 if (threadIdx.y < 1) { 187 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 188 y[row] = shared[threadIdx.x]; 189 } 190 } 191 } 192 193 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 194 { 195 __shared__ MatScalar shared[512]; 196 PetscInt i, row, slice_id, row_in_slice; 197 /* multiple threads per row. */ 198 row = blockIdx.x * blockDim.x + threadIdx.x; 199 if (row < nrows) { 200 slice_id = row / SLICE_HEIGHT; 201 row_in_slice = row % SLICE_HEIGHT; 202 203 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 204 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 205 __syncthreads(); 206 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 207 __syncthreads(); 208 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 209 __syncthreads(); 210 if (threadIdx.y < 1) { 211 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 212 y[row] = shared[threadIdx.x]; 213 } 214 } 215 } 216 217 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 218 { 219 __shared__ MatScalar shared[512]; 220 PetscInt i, row, slice_id, row_in_slice; 221 /* multiple threads per row. */ 222 row = blockIdx.x * blockDim.x + threadIdx.x; 223 if (row < nrows) { 224 slice_id = row / SLICE_HEIGHT; 225 row_in_slice = row % SLICE_HEIGHT; 226 227 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 228 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 229 __syncthreads(); 230 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 231 __syncthreads(); 232 if (threadIdx.y < 1) { 233 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 234 y[row] = shared[threadIdx.x]; 235 } 236 } 237 } 238 239 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 240 { 241 __shared__ MatScalar shared[512]; 242 PetscInt i, row, slice_id, row_in_slice; 243 /* multiple threads per row. */ 244 row = blockIdx.x * blockDim.x + threadIdx.x; 245 if (row < nrows) { 246 slice_id = row / SLICE_HEIGHT; 247 row_in_slice = row % SLICE_HEIGHT; 248 249 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 250 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 251 __syncthreads(); 252 if (threadIdx.y < 1) { 253 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 254 y[row] = shared[threadIdx.x]; 255 } 256 } 257 } 258 259 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 260 { 261 __shared__ MatScalar shared[512]; 262 PetscInt i, row, slice_id, row_in_slice; 263 /* multiple threads per row. */ 264 row = blockIdx.x * blockDim.x + threadIdx.x; 265 if (row < nrows) { 266 slice_id = row / SLICE_HEIGHT; 267 row_in_slice = row % SLICE_HEIGHT; 268 269 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 270 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 271 __syncthreads(); 272 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 273 __syncthreads(); 274 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 275 __syncthreads(); 276 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 277 __syncthreads(); 278 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 279 __syncthreads(); 280 if (threadIdx.y < 1) { 281 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 282 z[row] = y[row] + shared[threadIdx.x]; 283 } 284 } 285 } 286 287 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 288 { 289 __shared__ MatScalar shared[512]; 290 PetscInt i, row, slice_id, row_in_slice; 291 /* multiple threads per row. */ 292 row = blockIdx.x * blockDim.x + threadIdx.x; 293 if (row < nrows) { 294 slice_id = row / SLICE_HEIGHT; 295 row_in_slice = row % SLICE_HEIGHT; 296 297 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 298 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 299 __syncthreads(); 300 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 301 __syncthreads(); 302 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 303 __syncthreads(); 304 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 305 __syncthreads(); 306 if (threadIdx.y < 1) { 307 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 308 z[row] = y[row] + shared[threadIdx.x]; 309 } 310 } 311 } 312 313 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 314 { 315 __shared__ MatScalar shared[512]; 316 PetscInt i, row, slice_id, row_in_slice; 317 /* multiple threads per row. */ 318 row = blockIdx.x * blockDim.x + threadIdx.x; 319 if (row < nrows) { 320 slice_id = row / SLICE_HEIGHT; 321 row_in_slice = row % SLICE_HEIGHT; 322 323 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 324 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 325 __syncthreads(); 326 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 327 __syncthreads(); 328 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 329 __syncthreads(); 330 if (threadIdx.y < 1) { 331 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 332 z[row] = y[row] + shared[threadIdx.x]; 333 } 334 } 335 } 336 337 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 338 { 339 __shared__ MatScalar shared[512]; 340 PetscInt i, row, slice_id, row_in_slice; 341 /* multiple threads per row. */ 342 row = blockIdx.x * blockDim.x + threadIdx.x; 343 if (row < nrows) { 344 slice_id = row / SLICE_HEIGHT; 345 row_in_slice = row % SLICE_HEIGHT; 346 347 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 348 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 349 __syncthreads(); 350 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 351 __syncthreads(); 352 if (threadIdx.y < 1) { 353 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 354 z[row] = y[row] + shared[threadIdx.x]; 355 } 356 } 357 } 358 359 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 360 { 361 __shared__ MatScalar shared[512]; 362 PetscInt i, row, slice_id, row_in_slice; 363 /* multiple threads per row. */ 364 row = blockIdx.x * blockDim.x + threadIdx.x; 365 if (row < nrows) { 366 slice_id = row / SLICE_HEIGHT; 367 row_in_slice = row % SLICE_HEIGHT; 368 369 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 370 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 371 __syncthreads(); 372 if (threadIdx.y < 1) { 373 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 374 z[row] = y[row] + shared[threadIdx.x]; 375 } 376 } 377 } 378 379 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 380 { 381 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 382 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 383 PetscScalar *y; 384 const PetscScalar *x; 385 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 386 MatScalar *aval; 387 PetscInt *acolidx; 388 PetscInt *sliidx; 389 PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 390 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 391 392 PetscFunctionBegin; 393 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 394 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 395 /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 396 aval = cudastruct->val; 397 acolidx = cudastruct->colidx; 398 sliidx = cudastruct->sliidx; 399 400 PetscCall(VecCUDAGetArrayRead(xx, &x)); 401 PetscCall(VecCUDAGetArrayWrite(yy, &y)); 402 PetscCall(PetscLogGpuTimeBegin()); 403 404 switch (cudastruct->kernelchoice) { 405 case 9: 406 nblocks = 1 + (nrows - 1) / sliceheight; 407 if (cudastruct->blocky == 2) { 408 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 409 } else if (cudastruct->blocky == 4) { 410 matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 411 } else if (cudastruct->blocky == 8) { 412 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 413 } else if (cudastruct->blocky == 16) { 414 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 415 } else if (cudastruct->blocky == 32) { 416 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 417 } 418 break; 419 case 7: 420 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 421 if (cudastruct->blocky == 2) { 422 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 423 } else if (cudastruct->blocky == 4) { 424 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 425 } else if (cudastruct->blocky == 8) { 426 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 427 } else if (cudastruct->blocky == 16) { 428 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 429 } else if (cudastruct->blocky == 32) { 430 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 431 } else { 432 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 433 } 434 break; 435 case 6: 436 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 437 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 438 break; 439 case 5: 440 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 441 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 442 break; 443 case 4: 444 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 445 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 446 break; 447 case 3: 448 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 449 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 450 break; 451 case 2: /* 16 slices per block if blocksize=512 */ 452 nblocks = 1 + (nrows - 1) / (blocksize / 2); 453 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 454 break; 455 case 1: /* 32 slices per block if blocksize=512 */ 456 nblocks = 1 + (nrows - 1) / blocksize; 457 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 458 break; 459 case 0: 460 if (sliceheight * a->maxslicewidth > 20800) { /* important threshold */ 461 nblocks = 1 + (nrows - 1) / sliceheight; 462 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 463 } else { 464 nblocks = 1 + (nrows - 1) / sliceheight; 465 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 466 } 467 } 468 PetscCall(PetscLogGpuTimeEnd()); 469 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 470 PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 471 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 472 PetscFunctionReturn(PETSC_SUCCESS); 473 } 474 475 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 476 { 477 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 478 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 479 PetscScalar *z; 480 const PetscScalar *y, *x; 481 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 482 MatScalar *aval = cudastruct->val; 483 PetscInt *acolidx = cudastruct->colidx; 484 PetscInt *sliidx = cudastruct->sliidx; 485 486 PetscFunctionBegin; 487 PetscCheck(sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 488 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 489 if (a->nz) { 490 PetscInt nblocks, blocksize = 512; 491 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 492 PetscCall(VecCUDAGetArrayRead(xx, &x)); 493 PetscCall(VecCUDAGetArrayRead(yy, &y)); 494 PetscCall(VecCUDAGetArrayWrite(zz, &z)); 495 PetscCall(PetscLogGpuTimeBegin()); 496 497 switch (cudastruct->kernelchoice) { 498 case 6: 499 nblocks = 1 + (nrows - 1) / (blocksize / 32); 500 matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); 501 break; 502 case 5: 503 nblocks = 1 + (nrows - 1) / (blocksize / 16); 504 matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); 505 break; 506 case 4: 507 nblocks = 1 + (nrows - 1) / (blocksize / 8); 508 matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); 509 break; 510 case 3: 511 nblocks = 1 + (nrows - 1) / (blocksize / 4); 512 matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); 513 break; 514 case 2: 515 nblocks = 1 + (nrows - 1) / (blocksize / 2); 516 matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); 517 break; 518 case 1: 519 nblocks = 1 + (nrows - 1) / blocksize; 520 matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 521 break; 522 case 0: /* TODO */ 523 break; 524 } 525 PetscCall(PetscLogGpuTimeEnd()); 526 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 527 PetscCall(VecCUDARestoreArrayRead(yy, &y)); 528 PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 529 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 530 } else { 531 PetscCall(VecCopy(yy, zz)); 532 } 533 PetscFunctionReturn(PETSC_SUCCESS); 534 } 535 536 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 537 { 538 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 539 PetscInt kernel, blocky; 540 PetscBool flg; 541 542 PetscFunctionBegin; 543 PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 544 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 545 if (flg) { 546 PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 547 cudastruct->kernelchoice = kernel; 548 } 549 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 550 if (flg) { 551 PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", kernel); 552 cudastruct->blocky = blocky; 553 } 554 PetscOptionsHeadEnd(); 555 PetscFunctionReturn(PETSC_SUCCESS); 556 } 557 558 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 559 { 560 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 561 562 PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 563 PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 564 PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 565 PetscFunctionReturn(PETSC_SUCCESS); 566 } 567 568 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 569 { 570 PetscFunctionBegin; 571 PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 572 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 573 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 574 if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 575 A->ops->mult = MatMult_SeqSELLCUDA; 576 A->ops->multadd = MatMultAdd_SeqSELLCUDA; 577 PetscFunctionReturn(PETSC_SUCCESS); 578 } 579 580 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 581 { 582 PetscFunctionBegin; 583 if (A->factortype == MAT_FACTOR_NONE) { 584 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 585 } 586 PetscCall(MatDestroy_SeqSELL(A)); 587 PetscFunctionReturn(PETSC_SUCCESS); 588 } 589 590 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 591 static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 592 { 593 PetscFunctionBegin; 594 PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 595 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 596 PetscFunctionReturn(PETSC_SUCCESS); 597 } 598 599 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 600 { 601 Mat_SeqSELLCUDA *cudastruct; 602 603 PetscFunctionBegin; 604 PetscCall(PetscFree(B->defaultvectype)); 605 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 606 607 if (!B->spptr) { 608 if (B->factortype == MAT_FACTOR_NONE) { 609 PetscCall(PetscNew(&cudastruct)); 610 B->spptr = cudastruct; 611 } 612 } 613 614 B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 615 B->ops->destroy = MatDestroy_SeqSELLCUDA; 616 B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 617 B->ops->mult = MatMult_SeqSELLCUDA; 618 B->ops->multadd = MatMultAdd_SeqSELLCUDA; 619 B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 620 621 /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 622 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 623 624 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 625 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 626 PetscFunctionReturn(PETSC_SUCCESS); 627 } 628 629 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 630 { 631 PetscFunctionBegin; 632 PetscCall(MatCreate_SeqSELL(B)); 633 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 634 PetscFunctionReturn(PETSC_SUCCESS); 635 } 636