#include <cuda_runtime.h>

#include <petscdevice_cuda.h>
#include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/

#define SLICE_HEIGHT 16

typedef struct {
  PetscInt  *colidx; /* column index */
  MatScalar *val;
  PetscInt  *sliidx;
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
} Mat_SeqSELLCUDA;

static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
{
  PetscFunctionBegin;
  if (*cudastruct) {
    if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
    if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
    if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
    PetscCall(PetscFree(*cudastruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
      /* copy values only */
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
      if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
      if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
      /* copy values, nz or maxallocmat? */
      PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));

      PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
      PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
      cudastruct->nonzerostate = A->nonzerostate;
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
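/*
  The SpMV kernels below assume the sliced ELLPACK (SELL) layout with a slice height of
  SLICE_HEIGHT = 16: rows are grouped into slices of 16, each slice is padded to the width of
  its longest row and stored column by column, and sliidx[s] is the offset of slice s in
  val/colidx. Hence entry k of row r lives at index sliidx[r / 16] + (r % 16) + k * 16, which
  is the stride-16 loop every kernel uses.
*/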
__global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

__global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[SLICE_HEIGHT][BLOCKY];
  PetscInt             i, row, slice_id;
  slice_id = blockIdx.x;
  row      = slice_id * SLICE_HEIGHT + threadIdx.x;
  int tid  = threadIdx.x + threadIdx.y * SLICE_HEIGHT;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t    = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  /* transpose layout to reduce each row using warp shfl */
  shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  t = shared[tidy][tidx];
#pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
  __syncthreads();
  if (tidx == 0) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0) { y[row] = shared[0][threadIdx.x]; }
}

/* use 1 warp per slice, suitable for small slice width */
__global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * SLICE_HEIGHT + threadIdx.x % SLICE_HEIGHT;
  PetscScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
  }
  /* each row is handled by two lanes (x and x+16); fold the upper half into the lower */
  t += __shfl_down_sync(0xffffffff, t, 16);
  if (row < nrows && threadIdx.x < 16) { y[row] = t; }
}
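/*
  The tiled kernels below (kernel6 down to kernel2) assign blockDim.y threads to each row:
  every thread accumulates a partial row sum with stride SLICE_HEIGHT * blockDim.y, and the
  partials are then folded in shared memory by a binary reduction tree (blockDim.y = 32, 16,
  8, 4, 2 respectively, one halving step per __syncthreads() round).
*/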
__global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
__global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

__global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}
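/*
  The matmultadd_* kernels below mirror the matmult_* family exactly; the only difference is
  the final store, z[row] = y[row] + sum, so MatMultAdd() can reuse the same slice-parallel
  accumulation.
*/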
__global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
__global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

__global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[512];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / SLICE_HEIGHT;
    row_in_slice = row % SLICE_HEIGHT;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
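/*
  Host-side dispatch: cudastruct->kernelchoice selects one of the kernels above (settable with
  -mat_sell_spmv_cuda_kernel). kernelchoice 0 picks automatically from the slice-width
  statistics gathered at assembly time: the warp-per-slice kernel7 for very narrow slices, the
  block-per-slice kernel9 otherwise, with BLOCKY sized to the average slice width.
*/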
PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize must be a multiple of SLICE_HEIGHT*32 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
  dim3               block_k8(32, SLICE_HEIGHT);

  PetscFunctionBegin;
  PetscCheck(a->sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;

  PetscCall(VecCUDAGetArrayRead(xx, &x));
  PetscCall(VecCUDAGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (cudastruct->kernelchoice) {
  case 9:
    if (a->maxslicewidth > 512) {
      nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else if (a->avgslicewidth < 8) {
      nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else if (a->avgslicewidth < 16) {
      nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(SLICE_HEIGHT, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else if (a->avgslicewidth < 32) {
      nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(SLICE_HEIGHT, 8)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else if (a->avgslicewidth < 64) {
      nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(SLICE_HEIGHT, 16)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else {
      nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    }
    break;
  case 8:
    nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
    matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 7:
    nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
    matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if blocksize=512 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    break;
  case 0:
    if (a->maxslicewidth > 64) {
      nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
      matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(SLICE_HEIGHT, 32)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
    } else {
      if (a->avgslicewidth < 8) {
        nblocks = 1 + (nrows - 1) / (4 * SLICE_HEIGHT);
        matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
      } else if (a->avgslicewidth < 16) {
        nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
        matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(SLICE_HEIGHT, 4)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
      } else if (a->avgslicewidth < 32) {
        nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(SLICE_HEIGHT, 8)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / SLICE_HEIGHT;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(SLICE_HEIGHT, 16)>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
      }
    }
    break;
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(xx, &x));
  PetscCall(VecCUDARestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;

  PetscFunctionBegin;
  PetscCheck(a->sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
  PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays may be reallocated by MatSeqSELLCUDACopyToGPU(), so fetch the pointers only after the copy */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecCUDAGetArrayRead(xx, &x));
    PetscCall(VecCUDAGetArrayRead(yy, &y));
    PetscCall(VecCUDAGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (cudastruct->kernelchoice) {
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
      break;
    case 0: /* TODO */
      break;
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(xx, &x));
    PetscCall(VecCUDARestoreArrayRead(yy, &y));
    PetscCall(VecCUDARestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
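/*
  Example (an assumed typical invocation, not taken from the PETSc manual): the SpMV kernel
  can be chosen at runtime with

    ./app -mat_type seqsellcuda -mat_sell_spmv_cuda_kernel 7

  Since PetscNew() zeroes the struct, kernelchoice defaults to 0, i.e. the heuristic dispatch
  in MatMult_SeqSELLCUDA(); note that case 0 of MatMultAdd_SeqSELLCUDA() is still a TODO and
  launches no kernel.
*/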
static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
  PetscInt         kernel;
  PetscBool        flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
    cudastruct->kernelchoice = kernel;
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
  A->ops->mult    = MatMult_SeqSELLCUDA;
  A->ops->multadd = MatMultAdd_SeqSELLCUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
  }
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);

static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
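/*
  The conversion routine installs the CUDA method table on an existing MATSEQSELL matrix: it
  switches the default vector type to VECCUDA, attaches the (initially empty) device mirror in
  B->spptr, and defers the actual host-to-device copy to the first MatSeqSELLCUDACopyToGPU()
  call by marking the matrix PETSC_OFFLOAD_UNALLOCATED.
*/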
PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
{
  Mat_SeqSELLCUDA *cudastruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&cudastruct));
      B->spptr = cudastruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
  B->ops->destroy        = MatDestroy_SeqSELLCUDA;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
  B->ops->mult           = MatMult_SeqSELLCUDA;
  B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
  B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
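/*
  Minimal host-side sketch (hypothetical driver; assumes PETSc configured with CUDA and the
  usual PetscInitialize/PetscFinalize scaffolding):

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, n, n, n, n));
    PetscCall(MatSetType(A, MATSEQSELLCUDA));
    PetscCall(MatSetUp(A));
    ... MatSetValues(...) ...
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));  <- triggers MatAssemblyEnd_SeqSELLCUDA()
    PetscCall(MatMult(A, x, y));                       <- dispatches to MatMult_SeqSELLCUDA()
*/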