1 #include <cuda_runtime.h> 2 3 #include <petscdevice_cuda.h> 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 6 #define SLICE_HEIGHT 16 7 8 typedef struct { 9 PetscInt maxallocmat; 10 PetscInt totalentries; 11 PetscInt *colidx; /* column index array, device pointer */ 12 MatScalar *val; /* value array, device pointer */ 13 PetscInt totalslices; 14 PetscInt *sliidx; /* slice index array, device pointer */ 15 PetscInt nonzerostate; 16 PetscInt kernelchoice; 17 PetscInt blocky; 18 PetscInt chunksperblock; 19 PetscInt totalchunks; 20 PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */ 21 } Mat_SeqSELLCUDA; 22 23 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 24 { 25 PetscFunctionBegin; 26 if (*cudastruct) { 27 if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 28 if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 29 if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 30 if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); } 31 PetscCall(PetscFree(*cudastruct)); 32 } 33 PetscFunctionReturn(PETSC_SUCCESS); 34 } 35 36 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 37 { 38 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 39 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 40 41 PetscFunctionBegin; 42 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 43 PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 44 if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 45 /* copy values only */ 46 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 47 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 48 } else { 49 if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 50 if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 51 if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 52 if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); } 53 cudastruct->maxallocmat = a->maxallocmat; 54 cudastruct->totalentries = a->sliidx[a->totalslices]; 55 cudastruct->totalslices = a->totalslices; 56 cudastruct->totalchunks = a->totalchunks; 57 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(*(cudastruct->colidx)))); 58 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(*(cudastruct->val)))); 59 /* copy values, nz or maxallocmat? */ 60 PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*(a->colidx)), cudaMemcpyHostToDevice)); 61 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*(a->val)), cudaMemcpyHostToDevice)); 62 63 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(*(cudastruct->sliidx)))); 64 PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*(a->sliidx)), cudaMemcpyHostToDevice)); 65 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->chunk_slice_map), a->totalchunks * sizeof(*(cudastruct->chunk_slice_map)))); 66 PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*(a->chunk_slice_map)), cudaMemcpyHostToDevice)); 67 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt))); 68 } 69 PetscCallCUDA(WaitForCUDA()); 70 PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 71 A->offloadmask = PETSC_OFFLOAD_BOTH; 72 } 73 PetscFunctionReturn(PETSC_SUCCESS); 74 } 75 76 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 77 { 78 PetscInt i, row, slice_id, row_in_slice; 79 MatScalar sum; 80 /* one thread per row. */ 81 row = blockIdx.x * blockDim.x + threadIdx.x; 82 if (row < nrows) { 83 slice_id = row / sliceheight; 84 row_in_slice = row % sliceheight; 85 sum = 0.0; 86 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 87 y[row] = sum; 88 } 89 } 90 91 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 92 { 93 PetscInt i, row, slice_id, row_in_slice; 94 MatScalar sum; 95 /* one thread per row. */ 96 row = blockIdx.x * blockDim.x + threadIdx.x; 97 if (row < nrows) { 98 slice_id = row / sliceheight; 99 row_in_slice = row % sliceheight; 100 sum = 0.0; 101 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 102 z[row] = y[row] + sum; 103 } 104 } 105 106 #if !defined(PETSC_USE_COMPLEX) 107 /* use 1 block per slice, suitable for large slice width */ 108 template <int BLOCKY> 109 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 110 { 111 __shared__ MatScalar shared[32][BLOCKY]; 112 PetscInt i, row, slice_id = blockIdx.x; 113 int tid = threadIdx.x + threadIdx.y * 32; 114 /* transposed index */ 115 int tidx = tid % BLOCKY; 116 int tidy = tid / BLOCKY; 117 PetscScalar t = 0.0; 118 119 row = slice_id * sliceheight + threadIdx.x % sliceheight; 120 if (row < nrows) { 121 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 122 } 123 #pragma unroll 124 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 125 /* transpose layout to reduce each row using warp shfl */ 126 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 127 __syncthreads(); 128 if (tidy < sliceheight) t = shared[tidy][tidx]; 129 #pragma unroll 130 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 131 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 132 __syncthreads(); 133 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; } 134 } 135 136 /* use 1 block per slice, suitable for large slice width */ 137 template <int BLOCKY> 138 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 139 { 140 __shared__ MatScalar shared[32][BLOCKY]; 141 PetscInt i, row, slice_id = blockIdx.x; 142 int tid = threadIdx.x + threadIdx.y * 32; 143 /* transposed index */ 144 int tidx = tid % BLOCKY; 145 int tidy = tid / BLOCKY; 146 PetscScalar t = 0.0; 147 148 row = slice_id * sliceheight + threadIdx.x % sliceheight; 149 if (row < nrows) { 150 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 151 } 152 #pragma unroll 153 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 154 /* transpose layout to reduce each row using warp shfl */ 155 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 156 __syncthreads(); 157 if (tidy < sliceheight) t = shared[tidy][tidx]; 158 #pragma unroll 159 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 160 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 161 __syncthreads(); 162 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; } 163 } 164 165 template <int BLOCKY> 166 __device__ __forceinline__ bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val) 167 { 168 bool head = true; 169 #pragma unroll 170 for (int i = 1; i < BLOCKY * 2; i <<= 1) { 171 int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16; 172 shared[threadIdx.x + threadIdx.y * 32] = 0; 173 if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) { 174 shared[threadIdx.x + threadIdx.y * 32] = *val; 175 if (i == 1) head = false; 176 } 177 __syncthreads(); 178 if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16]; 179 __syncthreads(); 180 } 181 return head; 182 } 183 184 /* load-balancing version. Chunksize is equal to the number of threads per block */ 185 template <int BLOCKY> 186 __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 187 { 188 __shared__ MatScalar shared[BLOCKY * 32]; 189 PetscInt gid, row, start_slice, cid; 190 PetscScalar t = 0.0; 191 /* zero out y */ 192 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 193 gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 194 if (gid < nrows) y[gid] = 0.0; 195 } 196 for (int iter = 0; iter < chunksperblock; iter++) { 197 cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 198 if (cid < totalchunks) { 199 start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 200 gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 201 if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 202 __shared__ PetscInt flag[BLOCKY * 2]; 203 bool write; 204 PetscInt slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices]; 205 /* find out the slice that this element belongs to */ 206 while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 207 if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 208 row = slice_id * sliceheight + threadIdx.x % sliceheight; 209 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 210 __syncthreads(); 211 write = segment_scan<BLOCKY>(flag, shared, &t); 212 if (row < nrows && gid < totalentries && write) atomicAdd(&y[row], t); 213 t = 0.0; 214 } else { /* this iteration covers only one slice */ 215 row = start_slice * sliceheight + threadIdx.x % sliceheight; 216 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 217 if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 218 int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 219 /* reduction and write to output vector */ 220 #pragma unroll 221 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 222 /* transpose layout to reduce each row using warp shfl */ 223 if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 224 __syncthreads(); 225 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 226 #pragma unroll 227 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 228 if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ } 229 __syncthreads(); 230 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 231 t = 0.0; 232 } 233 } 234 } 235 } 236 } 237 238 /* load-balancing version. Chunksize is equal to the number of threads per block */ 239 template <int BLOCKY> 240 __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 241 { 242 __shared__ MatScalar shared[BLOCKY * 32]; 243 PetscInt gid, row, start_slice, cid; 244 PetscScalar t = 0.0; 245 /* copy y to z */ 246 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) { 247 gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 248 if (gid < nrows) z[gid] = y[gid]; 249 } 250 for (int iter = 0; iter < chunksperblock; iter++) { 251 cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 252 if (cid < totalchunks) { 253 start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 254 gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x; 255 if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 256 __shared__ PetscInt flag[BLOCKY * 2]; 257 bool write; 258 PetscInt slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices]; 259 /* find out the slice that this element belongs to */ 260 while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 261 if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id; 262 row = slice_id * sliceheight + threadIdx.x % sliceheight; 263 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 264 __syncthreads(); 265 write = segment_scan<BLOCKY>(flag, shared, &t); 266 if (row < nrows && gid < totalentries && write) atomicAdd(&z[row], t); 267 t = 0.0; 268 } else { /* this iteration covers only one slice */ 269 row = start_slice * sliceheight + threadIdx.x % sliceheight; 270 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 271 if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 272 int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 273 /* reduction and write to output vector */ 274 #pragma unroll 275 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 276 /* transpose layout to reduce each row using warp shfl */ 277 if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 278 __syncthreads(); 279 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 280 #pragma unroll 281 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 282 if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ } 283 __syncthreads(); 284 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomicAdd(&z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 285 t = 0.0; 286 } 287 } 288 } 289 } 290 } 291 292 /* use 1 warp per slice, suitable for small slice width */ 293 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 294 { 295 PetscInt i, row, slice_id; 296 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 297 row = slice_id * sliceheight + threadIdx.x % sliceheight; 298 double t = 0.0; 299 if (row < nrows) { 300 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 301 } 302 #pragma unroll 303 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 304 if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; } 305 } 306 307 /* use 1 warp per slice, suitable for small slice width */ 308 __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 309 { 310 PetscInt i, row, slice_id; 311 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 312 row = slice_id * sliceheight + threadIdx.x % sliceheight; 313 double t = 0.0; 314 if (row < nrows) { 315 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 316 } 317 #pragma unroll 318 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 319 if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; } 320 } 321 #endif 322 323 /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/ 324 325 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 326 { 327 __shared__ MatScalar shared[512]; 328 PetscInt i, row, slice_id, row_in_slice; 329 /* multiple threads per row. */ 330 row = blockIdx.x * blockDim.x + threadIdx.x; 331 if (row < nrows) { 332 slice_id = row / SLICE_HEIGHT; 333 row_in_slice = row % SLICE_HEIGHT; 334 335 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 336 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 337 __syncthreads(); 338 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 339 __syncthreads(); 340 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 341 __syncthreads(); 342 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 343 __syncthreads(); 344 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 345 __syncthreads(); 346 if (threadIdx.y < 1) { 347 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 348 y[row] = shared[threadIdx.x]; 349 } 350 } 351 } 352 353 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 354 { 355 __shared__ MatScalar shared[512]; 356 PetscInt i, row, slice_id, row_in_slice; 357 /* multiple threads per row. */ 358 row = blockIdx.x * blockDim.x + threadIdx.x; 359 if (row < nrows) { 360 slice_id = row / SLICE_HEIGHT; 361 row_in_slice = row % SLICE_HEIGHT; 362 363 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 364 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 365 __syncthreads(); 366 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 367 __syncthreads(); 368 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 369 __syncthreads(); 370 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 371 __syncthreads(); 372 if (threadIdx.y < 1) { 373 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 374 y[row] = shared[threadIdx.x]; 375 } 376 } 377 } 378 379 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 380 { 381 __shared__ MatScalar shared[512]; 382 PetscInt i, row, slice_id, row_in_slice; 383 /* multiple threads per row. */ 384 row = blockIdx.x * blockDim.x + threadIdx.x; 385 if (row < nrows) { 386 slice_id = row / SLICE_HEIGHT; 387 row_in_slice = row % SLICE_HEIGHT; 388 389 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 390 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 391 __syncthreads(); 392 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 393 __syncthreads(); 394 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 395 __syncthreads(); 396 if (threadIdx.y < 1) { 397 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 398 y[row] = shared[threadIdx.x]; 399 } 400 } 401 } 402 403 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 404 { 405 __shared__ MatScalar shared[512]; 406 PetscInt i, row, slice_id, row_in_slice; 407 /* multiple threads per row. */ 408 row = blockIdx.x * blockDim.x + threadIdx.x; 409 if (row < nrows) { 410 slice_id = row / SLICE_HEIGHT; 411 row_in_slice = row % SLICE_HEIGHT; 412 413 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 414 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 415 __syncthreads(); 416 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 417 __syncthreads(); 418 if (threadIdx.y < 1) { 419 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 420 y[row] = shared[threadIdx.x]; 421 } 422 } 423 } 424 425 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 426 { 427 __shared__ MatScalar shared[512]; 428 PetscInt i, row, slice_id, row_in_slice; 429 /* multiple threads per row. */ 430 row = blockIdx.x * blockDim.x + threadIdx.x; 431 if (row < nrows) { 432 slice_id = row / SLICE_HEIGHT; 433 row_in_slice = row % SLICE_HEIGHT; 434 435 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 436 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 437 __syncthreads(); 438 if (threadIdx.y < 1) { 439 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 440 y[row] = shared[threadIdx.x]; 441 } 442 } 443 } 444 445 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 446 { 447 __shared__ MatScalar shared[512]; 448 PetscInt i, row, slice_id, row_in_slice; 449 /* multiple threads per row. */ 450 row = blockIdx.x * blockDim.x + threadIdx.x; 451 if (row < nrows) { 452 slice_id = row / SLICE_HEIGHT; 453 row_in_slice = row % SLICE_HEIGHT; 454 455 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 456 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 457 __syncthreads(); 458 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 459 __syncthreads(); 460 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 461 __syncthreads(); 462 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 463 __syncthreads(); 464 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 465 __syncthreads(); 466 if (threadIdx.y < 1) { 467 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 468 z[row] = y[row] + shared[threadIdx.x]; 469 } 470 } 471 } 472 473 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 474 { 475 __shared__ MatScalar shared[512]; 476 PetscInt i, row, slice_id, row_in_slice; 477 /* multiple threads per row. */ 478 row = blockIdx.x * blockDim.x + threadIdx.x; 479 if (row < nrows) { 480 slice_id = row / SLICE_HEIGHT; 481 row_in_slice = row % SLICE_HEIGHT; 482 483 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 484 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 485 __syncthreads(); 486 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 487 __syncthreads(); 488 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 489 __syncthreads(); 490 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 491 __syncthreads(); 492 if (threadIdx.y < 1) { 493 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 494 z[row] = y[row] + shared[threadIdx.x]; 495 } 496 } 497 } 498 499 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 500 { 501 __shared__ MatScalar shared[512]; 502 PetscInt i, row, slice_id, row_in_slice; 503 /* multiple threads per row. */ 504 row = blockIdx.x * blockDim.x + threadIdx.x; 505 if (row < nrows) { 506 slice_id = row / SLICE_HEIGHT; 507 row_in_slice = row % SLICE_HEIGHT; 508 509 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 510 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 511 __syncthreads(); 512 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 513 __syncthreads(); 514 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 515 __syncthreads(); 516 if (threadIdx.y < 1) { 517 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 518 z[row] = y[row] + shared[threadIdx.x]; 519 } 520 } 521 } 522 523 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 524 { 525 __shared__ MatScalar shared[512]; 526 PetscInt i, row, slice_id, row_in_slice; 527 /* multiple threads per row. */ 528 row = blockIdx.x * blockDim.x + threadIdx.x; 529 if (row < nrows) { 530 slice_id = row / SLICE_HEIGHT; 531 row_in_slice = row % SLICE_HEIGHT; 532 533 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 534 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 535 __syncthreads(); 536 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 537 __syncthreads(); 538 if (threadIdx.y < 1) { 539 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 540 z[row] = y[row] + shared[threadIdx.x]; 541 } 542 } 543 } 544 545 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 546 { 547 __shared__ MatScalar shared[512]; 548 PetscInt i, row, slice_id, row_in_slice; 549 /* multiple threads per row. */ 550 row = blockIdx.x * blockDim.x + threadIdx.x; 551 if (row < nrows) { 552 slice_id = row / SLICE_HEIGHT; 553 row_in_slice = row % SLICE_HEIGHT; 554 555 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 556 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 557 __syncthreads(); 558 if (threadIdx.y < 1) { 559 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 560 z[row] = y[row] + shared[threadIdx.x]; 561 } 562 } 563 } 564 565 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 566 { 567 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 568 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 569 PetscScalar *y; 570 const PetscScalar *x; 571 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 572 PetscInt chunksperblock, nchunks, *chunk_slice_map; 573 MatScalar *aval; 574 PetscInt *acolidx; 575 PetscInt *sliidx; 576 PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 577 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 578 PetscReal maxoveravg; 579 580 PetscFunctionBegin; 581 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 582 PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 583 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 584 /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 585 aval = cudastruct->val; 586 acolidx = cudastruct->colidx; 587 sliidx = cudastruct->sliidx; 588 589 PetscCall(VecCUDAGetArrayRead(xx, &x)); 590 PetscCall(VecCUDAGetArrayWrite(yy, &y)); 591 PetscCall(PetscLogGpuTimeBegin()); 592 593 switch (cudastruct->kernelchoice) { 594 #if !defined(PETSC_USE_COMPLEX) 595 case 9: 596 nblocks = 1 + (nrows - 1) / sliceheight; 597 if (cudastruct->blocky == 2) { 598 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 599 } else if (cudastruct->blocky == 4) { 600 matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 601 } else if (cudastruct->blocky == 8) { 602 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 603 } else if (cudastruct->blocky == 16) { 604 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 605 } else if (cudastruct->blocky == 32) { 606 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 607 } else { 608 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 609 } 610 break; 611 case 7: 612 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 613 if (cudastruct->blocky == 2) { 614 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 615 } else if (cudastruct->blocky == 4) { 616 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 617 } else if (cudastruct->blocky == 8) { 618 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 619 } else if (cudastruct->blocky == 16) { 620 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 621 } else if (cudastruct->blocky == 32) { 622 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 623 } else { 624 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 625 } 626 break; 627 #endif 628 case 6: 629 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 630 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 631 break; 632 case 5: 633 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 634 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 635 break; 636 case 4: 637 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 638 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 639 break; 640 case 3: 641 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 642 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 643 break; 644 case 2: /* 16 slices per block if blocksize=512 */ 645 nblocks = 1 + (nrows - 1) / (blocksize / 2); 646 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 647 break; 648 case 1: /* 32 slices per block if blocksize=512 */ 649 nblocks = 1 + (nrows - 1) / blocksize; 650 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 651 break; 652 #if !defined(PETSC_USE_COMPLEX) 653 case 0: 654 maxoveravg = a->maxslicewidth / a->avgslicewidth; 655 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 656 /* each block handles approximately one slice */ 657 PetscInt blocky = a->chunksize / 32; 658 nchunks = cudastruct->totalchunks; 659 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 660 nblocks = 1 + (nchunks - 1) / chunksperblock; 661 chunk_slice_map = cudastruct->chunk_slice_map; 662 if (blocky == 2) { 663 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 664 } else if (blocky == 4) { 665 matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 666 } else if (blocky == 8) { 667 matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 668 } else if (blocky == 16) { 669 matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 670 } else if (blocky == 32) { 671 matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 672 } else { 673 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 674 } 675 } else { 676 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 677 if (avgslicesize <= 432) { 678 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 679 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 680 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 681 } else { 682 nblocks = 1 + (nrows - 1) / sliceheight; 683 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 684 } 685 } else if (avgslicesize <= 2400) { 686 nblocks = 1 + (nrows - 1) / sliceheight; 687 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 688 } else { 689 nblocks = 1 + (nrows - 1) / sliceheight; 690 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 691 } 692 } 693 break; 694 #endif 695 default: 696 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice); 697 } 698 PetscCall(PetscLogGpuTimeEnd()); 699 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 700 PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 701 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 702 PetscFunctionReturn(PETSC_SUCCESS); 703 } 704 705 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 706 { 707 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 708 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 709 PetscScalar *z; 710 const PetscScalar *y, *x; 711 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 712 PetscInt chunksperblock, nchunks, *chunk_slice_map; 713 MatScalar *aval = cudastruct->val; 714 PetscInt *acolidx = cudastruct->colidx; 715 PetscInt *sliidx = cudastruct->sliidx; 716 PetscReal maxoveravg; 717 718 PetscFunctionBegin; 719 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 720 PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 721 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 722 if (a->nz) { 723 PetscInt blocky = cudastruct->blocky, nblocks, blocksize = 512; 724 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 725 PetscCall(VecCUDAGetArrayRead(xx, &x)); 726 PetscCall(VecCUDAGetArrayRead(yy, &y)); 727 PetscCall(VecCUDAGetArrayWrite(zz, &z)); 728 PetscCall(PetscLogGpuTimeBegin()); 729 730 switch (cudastruct->kernelchoice) { 731 #if !defined(PETSC_USE_COMPLEX) 732 case 9: 733 nblocks = 1 + (nrows - 1) / sliceheight; 734 if (blocky == 2) { 735 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 736 } else if (blocky == 4) { 737 matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 738 } else if (blocky == 8) { 739 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 740 } else if (blocky == 16) { 741 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 742 } else if (blocky == 32) { 743 matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 744 } else { 745 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 746 } 747 break; 748 case 8: 749 /* each block handles approximately one slice */ 750 nchunks = cudastruct->totalchunks; 751 blocky = a->chunksize / 32; 752 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 753 nblocks = 1 + (nchunks - 1) / chunksperblock; 754 chunk_slice_map = cudastruct->chunk_slice_map; 755 if (blocky == 2) { 756 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 757 } else if (blocky == 4) { 758 matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 759 } else if (blocky == 8) { 760 matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 761 } else if (blocky == 16) { 762 matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 763 } else if (blocky == 32) { 764 matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 765 } else { 766 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 767 } 768 break; 769 case 7: 770 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 771 if (blocky == 2) { 772 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 773 } else if (blocky == 4) { 774 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 775 } else if (blocky == 8) { 776 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 777 } else if (blocky == 16) { 778 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 779 } else if (blocky == 32) { 780 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 781 } else { 782 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 783 } 784 break; 785 #endif 786 case 6: 787 nblocks = 1 + (nrows - 1) / (blocksize / 32); 788 matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); 789 break; 790 case 5: 791 nblocks = 1 + (nrows - 1) / (blocksize / 16); 792 matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); 793 break; 794 case 4: 795 nblocks = 1 + (nrows - 1) / (blocksize / 8); 796 matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); 797 break; 798 case 3: 799 nblocks = 1 + (nrows - 1) / (blocksize / 4); 800 matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); 801 break; 802 case 2: 803 nblocks = 1 + (nrows - 1) / (blocksize / 2); 804 matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); 805 break; 806 case 1: 807 nblocks = 1 + (nrows - 1) / blocksize; 808 matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 809 break; 810 #if !defined(PETSC_USE_COMPLEX) 811 case 0: 812 maxoveravg = a->maxslicewidth / a->avgslicewidth; 813 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 814 /* each block handles approximately one slice */ 815 nchunks = cudastruct->totalchunks; 816 blocky = a->chunksize / 32; 817 chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize; 818 nblocks = 1 + (nchunks - 1) / chunksperblock; 819 chunk_slice_map = cudastruct->chunk_slice_map; 820 if (blocky == 2) { 821 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 822 } else if (blocky == 4) { 823 matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 824 } else if (blocky == 8) { 825 matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 826 } else if (blocky == 16) { 827 matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 828 } else if (blocky == 32) { 829 matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 830 } else { 831 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 832 } 833 } else { 834 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 835 if (avgslicesize <= 432) { 836 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 837 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 838 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 839 } else { 840 nblocks = 1 + (nrows - 1) / sliceheight; 841 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 842 } 843 } else if (avgslicesize <= 2400) { 844 nblocks = 1 + (nrows - 1) / sliceheight; 845 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 846 } else { 847 nblocks = 1 + (nrows - 1) / sliceheight; 848 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 849 } 850 } 851 break; 852 #endif 853 default: 854 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice); 855 } 856 PetscCall(PetscLogGpuTimeEnd()); 857 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 858 PetscCall(VecCUDARestoreArrayRead(yy, &y)); 859 PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 860 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 861 } else { 862 PetscCall(VecCopy(yy, zz)); 863 } 864 PetscFunctionReturn(PETSC_SUCCESS); 865 } 866 867 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 868 { 869 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 870 PetscInt kernel, blocky; 871 PetscBool flg; 872 873 PetscFunctionBegin; 874 PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 875 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 876 if (flg) { 877 PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky); 878 cudastruct->blocky = blocky; 879 } 880 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 881 if (flg) { 882 PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 883 cudastruct->kernelchoice = kernel; 884 if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); } 885 } 886 PetscOptionsHeadEnd(); 887 PetscFunctionReturn(PETSC_SUCCESS); 888 } 889 890 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 891 { 892 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 893 894 PetscFunctionBegin; 895 PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 896 PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 897 PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 898 PetscFunctionReturn(PETSC_SUCCESS); 899 } 900 901 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 902 { 903 PetscFunctionBegin; 904 PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 905 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 906 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 907 if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 908 A->ops->mult = MatMult_SeqSELLCUDA; 909 A->ops->multadd = MatMultAdd_SeqSELLCUDA; 910 PetscFunctionReturn(PETSC_SUCCESS); 911 } 912 913 static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A) 914 { 915 PetscBool both = PETSC_FALSE; 916 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 917 918 PetscFunctionBegin; 919 if (A->factortype == MAT_FACTOR_NONE) { 920 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 921 if (cudastruct->val) { 922 both = PETSC_TRUE; 923 PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*(cudastruct->val)))); 924 } 925 } 926 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 927 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 928 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 929 else A->offloadmask = PETSC_OFFLOAD_CPU; 930 PetscFunctionReturn(PETSC_SUCCESS); 931 } 932 933 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 934 { 935 PetscFunctionBegin; 936 if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); 937 PetscCall(MatDestroy_SeqSELL(A)); 938 PetscFunctionReturn(PETSC_SUCCESS); 939 } 940 941 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 942 static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 943 { 944 PetscFunctionBegin; 945 PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 946 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 947 PetscFunctionReturn(PETSC_SUCCESS); 948 } 949 950 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 951 { 952 Mat_SeqSELLCUDA *cudastruct; 953 954 PetscFunctionBegin; 955 PetscCall(PetscFree(B->defaultvectype)); 956 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 957 958 if (!B->spptr) { 959 if (B->factortype == MAT_FACTOR_NONE) { 960 PetscCall(PetscNew(&cudastruct)); 961 B->spptr = cudastruct; 962 } 963 } 964 965 B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 966 B->ops->destroy = MatDestroy_SeqSELLCUDA; 967 B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 968 B->ops->mult = MatMult_SeqSELLCUDA; 969 B->ops->multadd = MatMultAdd_SeqSELLCUDA; 970 B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 971 B->ops->zeroentries = MatZeroEntries_SeqSELLCUDA; 972 973 /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 974 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 975 976 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 977 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 978 PetscFunctionReturn(PETSC_SUCCESS); 979 } 980 981 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 982 { 983 PetscFunctionBegin; 984 PetscCall(MatCreate_SeqSELL(B)); 985 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 986 PetscCall(MatSetFromOptions(B)); 987 PetscFunctionReturn(PETSC_SUCCESS); 988 } 989