1 #include "hip/hip_runtime.h" 2 #include <hip/hip_runtime.h> 3 4 #include <petscdevice_hip.h> 5 #include <petsc/private/cupmatomics.hpp> 6 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 7 8 #define WARP_SIZE 64 9 10 typedef struct { 11 PetscInt maxallocmat; 12 PetscInt totalentries; 13 PetscInt *colidx; /* column index array, device pointer */ 14 MatScalar *val; /* value array, device pointer */ 15 PetscInt totalslices; 16 PetscInt *sliidx; /* slice index array, device pointer */ 17 PetscInt nonzerostate; 18 PetscInt kernelchoice; 19 PetscInt blocky; 20 PetscInt chunksperblock; 21 PetscInt totalchunks; 22 PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */ 23 } Mat_SeqSELLHIP; 24 25 static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct) 26 { 27 PetscFunctionBegin; 28 if (*hipstruct) { 29 if ((*hipstruct)->colidx) PetscCallHIP(hipFree((*hipstruct)->colidx)); 30 if ((*hipstruct)->val) PetscCallHIP(hipFree((*hipstruct)->val)); 31 if ((*hipstruct)->sliidx) PetscCallHIP(hipFree((*hipstruct)->sliidx)); 32 if ((*hipstruct)->chunk_slice_map) PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map)); 33 PetscCall(PetscFree(*hipstruct)); 34 } 35 PetscFunctionReturn(PETSC_SUCCESS); 36 } 37 38 static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A) 39 { 40 Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 41 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 42 43 PetscFunctionBegin; 44 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 45 PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0)); 46 if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) { 47 /* copy values only */ 48 PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice)); 49 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 50 } else { 51 if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx)); 52 if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val)); 53 if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx)); 54 if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map)); 55 hipstruct->maxallocmat = a->maxallocmat; 56 hipstruct->totalentries = a->sliidx[a->totalslices]; 57 hipstruct->totalslices = a->totalslices; 58 hipstruct->totalchunks = a->totalchunks; 59 PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx))); 60 PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val))); 61 /* copy values, nz or maxallocmat? */ 62 PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice)); 63 PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice)); 64 65 PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx))); 66 PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice)); 67 PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map))); 68 PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice)); 69 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt))); 70 } 71 PetscCallHIP(WaitForHIP()); 72 PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0)); 73 A->offloadmask = PETSC_OFFLOAD_BOTH; 74 } 75 PetscFunctionReturn(PETSC_SUCCESS); 76 } 77 78 static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 79 { 80 PetscInt i, row, slice_id, row_in_slice; 81 MatScalar sum; 82 /* one thread per row. */ 83 row = blockIdx.x * blockDim.x + threadIdx.x; 84 if (row < nrows) { 85 slice_id = row / sliceheight; 86 row_in_slice = row % sliceheight; 87 sum = 0.0; 88 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 89 y[row] = sum; 90 } 91 } 92 93 static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 94 { 95 PetscInt i, row, slice_id, row_in_slice; 96 MatScalar sum; 97 /* one thread per row. */ 98 row = blockIdx.x * blockDim.x + threadIdx.x; 99 if (row < nrows) { 100 slice_id = row / sliceheight; 101 row_in_slice = row % sliceheight; 102 sum = 0.0; 103 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 104 z[row] = y[row] + sum; 105 } 106 } 107 108 #if !defined(PETSC_USE_COMPLEX) 109 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wpass-failed") 110 /* use 1 block per slice, suitable for large slice width */ 111 template <int BLOCKY> 112 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 113 { 114 __shared__ MatScalar shared[WARP_SIZE][BLOCKY]; 115 PetscInt i, row, slice_id = blockIdx.x; 116 int tid = threadIdx.x + threadIdx.y * WARP_SIZE; 117 /* transposed index */ 118 int tidx = tid % BLOCKY; 119 int tidy = tid / BLOCKY; 120 PetscScalar t = 0.0; 121 122 row = slice_id * sliceheight + threadIdx.x % sliceheight; 123 if (row < nrows) { 124 for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]]; 125 } 126 #pragma unroll 127 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 128 /* transpose layout to reduce each row using warp shfl */ 129 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 130 __syncthreads(); 131 if (tidy < sliceheight) t = shared[tidy][tidx]; 132 #pragma unroll 133 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY); 134 if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; 135 __syncthreads(); 136 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x]; 137 } 138 139 /* use 1 block per slice, suitable for large slice width */ 140 template <int BLOCKY> 141 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 142 { 143 __shared__ MatScalar shared[WARP_SIZE][BLOCKY]; 144 PetscInt i, row, slice_id = blockIdx.x; 145 int tid = threadIdx.x + threadIdx.y * WARP_SIZE; 146 /* transposed index */ 147 int tidx = tid % BLOCKY; 148 int tidy = tid / BLOCKY; 149 PetscScalar t = 0.0; 150 151 row = slice_id * sliceheight + threadIdx.x % sliceheight; 152 if (row < nrows) { 153 for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]]; 154 } 155 #pragma unroll 156 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 157 /* transpose layout to reduce each row using warp shfl */ 158 if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 159 __syncthreads(); 160 if (tidy < sliceheight) t = shared[tidy][tidx]; 161 #pragma unroll 162 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY); 163 if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; 164 __syncthreads(); 165 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x]; 166 } 167 168 template <int BLOCKY> 169 __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val) 170 { 171 bool head = true; 172 #pragma unroll 173 for (int i = 1; i < BLOCKY * 2; i <<= 1) { 174 int halfwarpid = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2); 175 shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0; 176 if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) { 177 shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val; 178 if (i == 1) head = false; 179 } 180 __syncthreads(); 181 if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_SIZE]; 182 __syncthreads(); 183 } 184 return head; 185 } 186 187 /* load-balancing version. Chunksize is equal to the number of threads per block */ 188 template <int BLOCKY> 189 __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 190 { 191 __shared__ MatScalar shared[BLOCKY * WARP_SIZE]; 192 PetscInt gid, row, start_slice, cid; 193 PetscScalar t = 0.0; 194 AtomicAdd<MatScalar> atomAdd; 195 /* zero out y */ 196 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) { 197 gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; 198 if (gid < nrows) y[gid] = 0.0; 199 } 200 for (int iter = 0; iter < chunksperblock; iter++) { 201 cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 202 if (cid < totalchunks) { 203 start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 204 gid = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; 205 if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 206 __shared__ PetscInt flag[BLOCKY * 2]; 207 bool write; 208 PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices]; 209 /* find out the slice that this element belongs to */ 210 while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 211 if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id; 212 row = slice_id * sliceheight + threadIdx.x % sliceheight; 213 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 214 __syncthreads(); 215 write = segment_scan<BLOCKY>(flag, shared, &t); 216 if (row < nrows && gid < totalentries && write) atomAdd(y[row], t); 217 t = 0.0; 218 } else { /* this iteration covers only one slice */ 219 row = start_slice * sliceheight + threadIdx.x % sliceheight; 220 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 221 if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 222 int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 223 /* reduction and write to output vector */ 224 #pragma unroll 225 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 226 /* transpose layout to reduce each row using warp shfl */ 227 if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 228 __syncthreads(); 229 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 230 #pragma unroll 231 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY); 232 if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ 233 __syncthreads(); 234 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 235 t = 0.0; 236 } 237 } 238 } 239 } 240 } 241 242 /* load-balancing version. Chunksize is equal to the number of threads per block */ 243 template <int BLOCKY> 244 __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 245 { 246 __shared__ MatScalar shared[BLOCKY * WARP_SIZE]; 247 PetscInt gid, row, start_slice, cid; 248 PetscScalar t = 0.0; 249 AtomicAdd<MatScalar> atomAdd; 250 /* copy y to z */ 251 for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) { 252 gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; 253 if (gid < nrows) z[gid] = y[gid]; 254 } 255 for (int iter = 0; iter < chunksperblock; iter++) { 256 cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 257 if (cid < totalchunks) { 258 start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 259 gid = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; 260 if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 261 __shared__ PetscInt flag[BLOCKY * 2]; 262 bool write; 263 PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices]; 264 /* find out the slice that this element belongs to */ 265 while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 266 if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id; 267 row = slice_id * sliceheight + threadIdx.x % sliceheight; 268 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 269 __syncthreads(); 270 write = segment_scan<BLOCKY>(flag, shared, &t); 271 if (row < nrows && gid < totalentries && write) atomAdd(z[row], t); 272 t = 0.0; 273 } else { /* this iteration covers only one slice */ 274 row = start_slice * sliceheight + threadIdx.x % sliceheight; 275 if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 276 if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 277 int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 278 /* reduction and write to output vector */ 279 #pragma unroll 280 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 281 /* transpose layout to reduce each row using warp shfl */ 282 if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 283 __syncthreads(); 284 if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 285 #pragma unroll 286 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY); 287 if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ 288 __syncthreads(); 289 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 290 t = 0.0; 291 } 292 } 293 } 294 } 295 } 296 297 /* use 1 warp per slice, suitable for small slice width */ 298 static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 299 { 300 PetscInt i, row, slice_id; 301 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 302 row = slice_id * sliceheight + threadIdx.x % sliceheight; 303 double t = 0.0; 304 if (row < nrows) { 305 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]]; 306 } 307 #pragma unroll 308 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 309 if (row < nrows && threadIdx.x < sliceheight) y[row] = t; 310 } 311 312 /* use 1 warp per slice, suitable for small slice width */ 313 static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 314 { 315 PetscInt i, row, slice_id; 316 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 317 row = slice_id * sliceheight + threadIdx.x % sliceheight; 318 double t = 0.0; 319 if (row < nrows) { 320 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]]; 321 } 322 #pragma unroll 323 for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 324 if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t; 325 } 326 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 327 #endif 328 329 /*********** Kernel 2-6 require a slice height smaller than 512, 256, 128, 64, 32, espectively. They are kept only for performance comparison **********/ 330 331 static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 332 { 333 __shared__ MatScalar shared[32 * 16]; 334 PetscInt i, row, slice_id, row_in_slice; 335 /* multiple threads per row. */ 336 row = blockIdx.x * blockDim.x + threadIdx.x; 337 if (row < nrows) { 338 slice_id = row / sliceheight; 339 row_in_slice = row % sliceheight; 340 341 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 342 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 343 __syncthreads(); 344 if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; 345 __syncthreads(); 346 if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 347 __syncthreads(); 348 if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 349 __syncthreads(); 350 if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 351 __syncthreads(); 352 if (threadIdx.y < 1) { 353 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 354 y[row] = shared[threadIdx.x]; 355 } 356 } 357 } 358 359 static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 360 { 361 __shared__ MatScalar shared[32 * 16]; 362 PetscInt i, row, slice_id, row_in_slice; 363 /* multiple threads per row. */ 364 row = blockIdx.x * blockDim.x + threadIdx.x; 365 if (row < nrows) { 366 slice_id = row / sliceheight; 367 row_in_slice = row % sliceheight; 368 369 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 370 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 371 __syncthreads(); 372 if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 373 __syncthreads(); 374 if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 375 __syncthreads(); 376 if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 377 __syncthreads(); 378 if (threadIdx.y < 1) { 379 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 380 y[row] = shared[threadIdx.x]; 381 } 382 } 383 } 384 385 static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 386 { 387 __shared__ MatScalar shared[32 * 16]; 388 PetscInt i, row, slice_id, row_in_slice; 389 /* multiple threads per row. */ 390 row = blockIdx.x * blockDim.x + threadIdx.x; 391 if (row < nrows) { 392 slice_id = row / sliceheight; 393 row_in_slice = row % sliceheight; 394 395 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 396 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 397 __syncthreads(); 398 if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 399 __syncthreads(); 400 if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 401 __syncthreads(); 402 if (threadIdx.y < 1) { 403 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 404 y[row] = shared[threadIdx.x]; 405 } 406 } 407 } 408 409 static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 410 { 411 __shared__ MatScalar shared[32 * 16]; 412 PetscInt i, row, slice_id, row_in_slice; 413 /* multiple threads per row. */ 414 row = blockIdx.x * blockDim.x + threadIdx.x; 415 if (row < nrows) { 416 slice_id = row / sliceheight; 417 row_in_slice = row % sliceheight; 418 419 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 420 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 421 __syncthreads(); 422 if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 423 __syncthreads(); 424 if (threadIdx.y < 1) { 425 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 426 y[row] = shared[threadIdx.x]; 427 } 428 } 429 } 430 431 static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 432 { 433 __shared__ MatScalar shared[32 * 16]; 434 PetscInt i, row, slice_id, row_in_slice; 435 /* multiple threads per row. */ 436 row = blockIdx.x * blockDim.x + threadIdx.x; 437 if (row < nrows) { 438 slice_id = row / sliceheight; 439 row_in_slice = row % sliceheight; 440 441 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 442 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 443 __syncthreads(); 444 if (threadIdx.y < 1) { 445 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 446 y[row] = shared[threadIdx.x]; 447 } 448 } 449 } 450 451 static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 452 { 453 __shared__ MatScalar shared[32 * 16]; 454 PetscInt i, row, slice_id, row_in_slice; 455 /* multiple threads per row. */ 456 row = blockIdx.x * blockDim.x + threadIdx.x; 457 if (row < nrows) { 458 slice_id = row / sliceheight; 459 row_in_slice = row % sliceheight; 460 461 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 462 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 463 __syncthreads(); 464 if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; 465 __syncthreads(); 466 if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 467 __syncthreads(); 468 if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 469 __syncthreads(); 470 if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 471 __syncthreads(); 472 if (threadIdx.y < 1) { 473 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 474 z[row] = y[row] + shared[threadIdx.x]; 475 } 476 } 477 } 478 479 static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 480 { 481 __shared__ MatScalar shared[32 * 16]; 482 PetscInt i, row, slice_id, row_in_slice; 483 /* multiple threads per row. */ 484 row = blockIdx.x * blockDim.x + threadIdx.x; 485 if (row < nrows) { 486 slice_id = row / sliceheight; 487 row_in_slice = row % sliceheight; 488 489 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 490 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 491 __syncthreads(); 492 if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 493 __syncthreads(); 494 if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 495 __syncthreads(); 496 if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 497 __syncthreads(); 498 if (threadIdx.y < 1) { 499 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 500 z[row] = y[row] + shared[threadIdx.x]; 501 } 502 } 503 } 504 505 static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 506 { 507 __shared__ MatScalar shared[32 * 16]; 508 PetscInt i, row, slice_id, row_in_slice; 509 /* multiple threads per row. */ 510 row = blockIdx.x * blockDim.x + threadIdx.x; 511 if (row < nrows) { 512 slice_id = row / sliceheight; 513 row_in_slice = row % sliceheight; 514 515 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 516 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 517 __syncthreads(); 518 if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 519 __syncthreads(); 520 if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 521 __syncthreads(); 522 if (threadIdx.y < 1) { 523 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 524 z[row] = y[row] + shared[threadIdx.x]; 525 } 526 } 527 } 528 529 static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 530 { 531 __shared__ MatScalar shared[32 * 16]; 532 PetscInt i, row, slice_id, row_in_slice; 533 /* multiple threads per row. */ 534 row = blockIdx.x * blockDim.x + threadIdx.x; 535 if (row < nrows) { 536 slice_id = row / sliceheight; 537 row_in_slice = row % sliceheight; 538 539 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 540 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 541 __syncthreads(); 542 if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 543 __syncthreads(); 544 if (threadIdx.y < 1) { 545 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 546 z[row] = y[row] + shared[threadIdx.x]; 547 } 548 } 549 } 550 551 static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 552 { 553 __shared__ MatScalar shared[32 * 16]; 554 PetscInt i, row, slice_id, row_in_slice; 555 /* multiple threads per row. */ 556 row = blockIdx.x * blockDim.x + threadIdx.x; 557 if (row < nrows) { 558 slice_id = row / sliceheight; 559 row_in_slice = row % sliceheight; 560 561 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 562 for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 563 __syncthreads(); 564 if (threadIdx.y < 1) { 565 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 566 z[row] = y[row] + shared[threadIdx.x]; 567 } 568 } 569 } 570 571 static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy) 572 { 573 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 574 Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 575 PetscScalar *y; 576 const PetscScalar *x; 577 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 578 MatScalar *aval; 579 PetscInt *acolidx; 580 PetscInt *sliidx; 581 PetscInt nblocks, blocksize = 512; /* blocksize is fixed to be 512 */ 582 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 583 #if !defined(PETSC_USE_COMPLEX) 584 PetscInt chunksperblock, nchunks, *chunk_slice_map; 585 PetscReal maxoveravg; 586 #endif 587 588 PetscFunctionBegin; 589 PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 590 PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be less than 32, but the current slice height is %" PetscInt_FMT, sliceheight); 591 PetscCall(MatSeqSELLHIPCopyToGPU(A)); 592 /* hipstruct may not be available until MatSeqSELLHIPCopyToGPU() is called */ 593 aval = hipstruct->val; 594 acolidx = hipstruct->colidx; 595 sliidx = hipstruct->sliidx; 596 597 PetscCall(VecHIPGetArrayRead(xx, &x)); 598 PetscCall(VecHIPGetArrayWrite(yy, &y)); 599 PetscCall(PetscLogGpuTimeBegin()); 600 601 switch (hipstruct->kernelchoice) { 602 #if !defined(PETSC_USE_COMPLEX) 603 case 9: /* 1 slice per block */ 604 nblocks = 1 + (nrows - 1) / sliceheight; 605 if (hipstruct->blocky == 2) { 606 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 607 } else if (hipstruct->blocky == 4) { 608 matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 609 } else if (hipstruct->blocky == 8) { 610 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 611 } else if (hipstruct->blocky == 16) { 612 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 613 } else { 614 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 615 } 616 break; 617 case 7: /* each block handles blocky slices */ 618 nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight); 619 if (hipstruct->blocky == 2) { 620 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 621 } else if (hipstruct->blocky == 4) { 622 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 623 } else if (hipstruct->blocky == 8) { 624 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 625 } else if (hipstruct->blocky == 16) { 626 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 627 } else { 628 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 629 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 630 } 631 break; 632 #endif 633 case 6: 634 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=32 */ 635 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 636 break; 637 case 5: 638 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=32*/ 639 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 640 break; 641 case 4: 642 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=32 */ 643 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 644 break; 645 case 3: 646 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=32 */ 647 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 648 break; 649 case 2: /* 16 slices per block if sliceheight=32 */ 650 nblocks = 1 + (nrows - 1) / (blocksize / 2); 651 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 652 break; 653 case 1: /* 32 slices per block if sliceheight=32 */ 654 nblocks = 1 + (nrows - 1) / blocksize; 655 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 656 break; 657 #if !defined(PETSC_USE_COMPLEX) 658 case 0: 659 maxoveravg = a->maxslicewidth / a->avgslicewidth; 660 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 661 /* each block handles approximately one slice */ 662 PetscInt blocky = a->chunksize / 32; 663 nchunks = hipstruct->totalchunks; 664 chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize; 665 nblocks = 1 + (nchunks - 1) / chunksperblock; 666 chunk_slice_map = hipstruct->chunk_slice_map; 667 if (blocky == 2) { 668 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 669 } else if (blocky == 4) { 670 matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 671 } else if (blocky == 8) { 672 matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 673 } else if (blocky == 16) { 674 matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 675 } else { 676 matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 677 } 678 } else { 679 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 680 if (avgslicesize <= 432) { 681 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 682 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 683 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 684 } else { 685 nblocks = 1 + (nrows - 1) / sliceheight; 686 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 687 } 688 } else if (avgslicesize <= 2400) { 689 nblocks = 1 + (nrows - 1) / sliceheight; 690 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 691 } else { 692 nblocks = 1 + (nrows - 1) / sliceheight; 693 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 694 } 695 } 696 break; 697 #endif 698 default: 699 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice); 700 } 701 PetscCall(PetscLogGpuTimeEnd()); 702 PetscCall(VecHIPRestoreArrayRead(xx, &x)); 703 PetscCall(VecHIPRestoreArrayWrite(yy, &y)); 704 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 705 PetscFunctionReturn(PETSC_SUCCESS); 706 } 707 708 static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz) 709 { 710 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 711 Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 712 PetscScalar *z; 713 const PetscScalar *y, *x; 714 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 715 MatScalar *aval = hipstruct->val; 716 PetscInt *acolidx = hipstruct->colidx; 717 PetscInt *sliidx = hipstruct->sliidx; 718 #if !defined(PETSC_USE_COMPLEX) 719 PetscReal maxoveravg; 720 PetscInt chunksperblock, nchunks, *chunk_slice_map; 721 PetscInt blocky = hipstruct->blocky; 722 #endif 723 724 PetscFunctionBegin; 725 PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 726 PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight != sliceheight), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 727 PetscCall(MatSeqSELLHIPCopyToGPU(A)); 728 if (a->nz) { 729 PetscInt nblocks, blocksize = 512; 730 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 731 PetscCall(VecHIPGetArrayRead(xx, &x)); 732 PetscCall(VecHIPGetArrayRead(yy, &y)); 733 PetscCall(VecHIPGetArrayWrite(zz, &z)); 734 PetscCall(PetscLogGpuTimeBegin()); 735 736 switch (hipstruct->kernelchoice) { 737 #if !defined(PETSC_USE_COMPLEX) 738 case 9: 739 nblocks = 1 + (nrows - 1) / sliceheight; 740 if (blocky == 2) { 741 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 742 } else if (blocky == 4) { 743 matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 744 } else if (blocky == 8) { 745 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 746 } else if (blocky == 16) { 747 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 748 } else { 749 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 750 } 751 break; 752 case 8: 753 /* each block handles approximately one slice */ 754 nchunks = hipstruct->totalchunks; 755 blocky = a->chunksize / 32; 756 chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize; 757 nblocks = 1 + (nchunks - 1) / chunksperblock; 758 chunk_slice_map = hipstruct->chunk_slice_map; 759 if (blocky == 2) { 760 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 761 } else if (blocky == 4) { 762 matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 763 } else if (blocky == 8) { 764 matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 765 } else if (blocky == 16) { 766 matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 767 } else { 768 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 769 } 770 break; 771 case 7: 772 nblocks = 1 + (nrows - 1) / (blocky * sliceheight); 773 if (blocky == 2) { 774 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 775 } else if (blocky == 4) { 776 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 777 } else if (blocky == 8) { 778 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 779 } else if (blocky == 16) { 780 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 781 } else { 782 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 783 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 784 } 785 break; 786 #endif 787 case 6: 788 nblocks = 1 + (nrows - 1) / (blocksize / 32); 789 matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 790 break; 791 case 5: 792 nblocks = 1 + (nrows - 1) / (blocksize / 16); 793 matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 794 break; 795 case 4: 796 nblocks = 1 + (nrows - 1) / (blocksize / 8); 797 matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 798 break; 799 case 3: 800 nblocks = 1 + (nrows - 1) / (blocksize / 4); 801 matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 802 break; 803 case 2: 804 nblocks = 1 + (nrows - 1) / (blocksize / 2); 805 matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 806 break; 807 case 1: 808 nblocks = 1 + (nrows - 1) / blocksize; 809 matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 810 break; 811 #if !defined(PETSC_USE_COMPLEX) 812 case 0: 813 maxoveravg = a->maxslicewidth / a->avgslicewidth; 814 if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 815 /* each block handles approximately one slice */ 816 nchunks = hipstruct->totalchunks; 817 blocky = a->chunksize / 32; 818 chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize; 819 nblocks = 1 + (nchunks - 1) / chunksperblock; 820 chunk_slice_map = hipstruct->chunk_slice_map; 821 if (blocky == 2) { 822 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 823 } else if (blocky == 4) { 824 matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 825 } else if (blocky == 8) { 826 matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 827 } else if (blocky == 16) { 828 matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 829 } else { 830 matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 831 } 832 } else { 833 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 834 if (avgslicesize <= 432) { 835 if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 836 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 837 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 838 } else { 839 nblocks = 1 + (nrows - 1) / sliceheight; 840 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 841 } 842 } else if (avgslicesize <= 2400) { 843 nblocks = 1 + (nrows - 1) / sliceheight; 844 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 845 } else { 846 nblocks = 1 + (nrows - 1) / sliceheight; 847 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 848 } 849 } 850 break; 851 #endif 852 default: 853 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice); 854 } 855 PetscCall(PetscLogGpuTimeEnd()); 856 PetscCall(VecHIPRestoreArrayRead(xx, &x)); 857 PetscCall(VecHIPRestoreArrayRead(yy, &y)); 858 PetscCall(VecHIPRestoreArrayWrite(zz, &z)); 859 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 860 } else { 861 PetscCall(VecCopy(yy, zz)); 862 } 863 PetscFunctionReturn(PETSC_SUCCESS); 864 } 865 866 static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems PetscOptionsObject) 867 { 868 Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 869 PetscInt kernel, blocky; 870 PetscBool flg; 871 872 PetscFunctionBegin; 873 PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options"); 874 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg)); 875 if (flg) { 876 PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky); 877 hipstruct->blocky = blocky; 878 } 879 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg)); 880 if (flg) { 881 PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 882 hipstruct->kernelchoice = kernel; 883 if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg)); 884 } 885 PetscOptionsHeadEnd(); 886 PetscFunctionReturn(PETSC_SUCCESS); 887 } 888 889 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 890 { 891 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 892 893 PetscFunctionBegin; 894 PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 895 PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 896 PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 897 PetscFunctionReturn(PETSC_SUCCESS); 898 } 899 900 static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode) 901 { 902 PetscFunctionBegin; 903 PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 904 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 905 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 906 if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLHIPCopyToGPU(A)); 907 A->ops->mult = MatMult_SeqSELLHIP; 908 A->ops->multadd = MatMultAdd_SeqSELLHIP; 909 PetscFunctionReturn(PETSC_SUCCESS); 910 } 911 912 static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A) 913 { 914 PetscBool both = PETSC_FALSE; 915 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 916 917 PetscFunctionBegin; 918 if (A->factortype == MAT_FACTOR_NONE) { 919 Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 920 if (hipstruct->val) { 921 both = PETSC_TRUE; 922 PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val))); 923 } 924 } 925 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 926 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 927 else A->offloadmask = PETSC_OFFLOAD_CPU; 928 PetscFunctionReturn(PETSC_SUCCESS); 929 } 930 931 static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A) 932 { 933 PetscFunctionBegin; 934 if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr)); 935 PetscCall(MatDestroy_SeqSELL(A)); 936 PetscFunctionReturn(PETSC_SUCCESS); 937 } 938 939 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat); 940 static PetscErrorCode MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B) 941 { 942 PetscFunctionBegin; 943 PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 944 PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B)); 945 PetscFunctionReturn(PETSC_SUCCESS); 946 } 947 948 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B) 949 { 950 Mat_SeqSELLHIP *hipstruct; 951 952 PetscFunctionBegin; 953 PetscCall(PetscFree(B->defaultvectype)); 954 PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype)); 955 956 if (!B->spptr) { 957 if (B->factortype == MAT_FACTOR_NONE) { 958 PetscCall(PetscNew(&hipstruct)); 959 B->spptr = hipstruct; 960 } 961 } 962 963 B->ops->assemblyend = MatAssemblyEnd_SeqSELLHIP; 964 B->ops->destroy = MatDestroy_SeqSELLHIP; 965 B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP; 966 B->ops->mult = MatMult_SeqSELLHIP; 967 B->ops->multadd = MatMultAdd_SeqSELLHIP; 968 B->ops->duplicate = MatDuplicate_SeqSELLHIP; 969 B->ops->zeroentries = MatZeroEntries_SeqSELLHIP; 970 971 /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 972 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 973 974 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP)); 975 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 976 PetscFunctionReturn(PETSC_SUCCESS); 977 } 978 979 /*MC 980 MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs. 981 982 Options Database Keys: 983 + -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()` 984 . -mat_sell_spmv_hip_kernel - selects a spmv kernel for MatSELLHIP 985 - -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the spmv kernels. These kernels use a 2D block with the x dimension equal to the wrap size (normally 64 for AMD GPUs) 986 987 Level: beginner 988 989 .seealso: [](ch_matrices), `Mat`, `MATSELLHIP` 990 M*/ 991 992 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B) 993 { 994 PetscFunctionBegin; 995 PetscCall(MatCreate_SeqSELL(B)); 996 PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B)); 997 PetscCall(MatSetFromOptions(B)); 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000