1*d52a580bSJunchao Zhang #include "hip/hip_runtime.h" 2*d52a580bSJunchao Zhang #include <hip/hip_runtime.h> 3*d52a580bSJunchao Zhang 4*d52a580bSJunchao Zhang #include <petscdevice_hip.h> 5*d52a580bSJunchao Zhang #include <petsc/private/cupmatomics.hpp> 6*d52a580bSJunchao Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 7*d52a580bSJunchao Zhang 8*d52a580bSJunchao Zhang #define WARP_SIZE 64 9*d52a580bSJunchao Zhang 10*d52a580bSJunchao Zhang typedef struct { 11*d52a580bSJunchao Zhang PetscInt maxallocmat; 12*d52a580bSJunchao Zhang PetscInt totalentries; 13*d52a580bSJunchao Zhang PetscInt *colidx; /* column index array, device pointer */ 14*d52a580bSJunchao Zhang MatScalar *val; /* value array, device pointer */ 15*d52a580bSJunchao Zhang PetscInt totalslices; 16*d52a580bSJunchao Zhang PetscInt *sliidx; /* slice index array, device pointer */ 17*d52a580bSJunchao Zhang PetscInt nonzerostate; 18*d52a580bSJunchao Zhang PetscInt kernelchoice; 19*d52a580bSJunchao Zhang PetscInt blocky; 20*d52a580bSJunchao Zhang PetscInt chunksperblock; 21*d52a580bSJunchao Zhang PetscInt totalchunks; 22*d52a580bSJunchao Zhang PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */ 23*d52a580bSJunchao Zhang } Mat_SeqSELLHIP; 24*d52a580bSJunchao Zhang 25*d52a580bSJunchao Zhang static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct) 26*d52a580bSJunchao Zhang { 27*d52a580bSJunchao Zhang PetscFunctionBegin; 28*d52a580bSJunchao Zhang if (*hipstruct) { 29*d52a580bSJunchao Zhang if ((*hipstruct)->colidx) PetscCallHIP(hipFree((*hipstruct)->colidx)); 30*d52a580bSJunchao Zhang if ((*hipstruct)->val) PetscCallHIP(hipFree((*hipstruct)->val)); 31*d52a580bSJunchao Zhang if ((*hipstruct)->sliidx) PetscCallHIP(hipFree((*hipstruct)->sliidx)); 32*d52a580bSJunchao Zhang if ((*hipstruct)->chunk_slice_map) PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map)); 33*d52a580bSJunchao Zhang PetscCall(PetscFree(*hipstruct)); 34*d52a580bSJunchao Zhang } 35*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 36*d52a580bSJunchao Zhang } 37*d52a580bSJunchao Zhang 38*d52a580bSJunchao Zhang static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A) 39*d52a580bSJunchao Zhang { 40*d52a580bSJunchao Zhang Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 41*d52a580bSJunchao Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 42*d52a580bSJunchao Zhang 43*d52a580bSJunchao Zhang PetscFunctionBegin; 44*d52a580bSJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 45*d52a580bSJunchao Zhang PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0)); 46*d52a580bSJunchao Zhang if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) { 47*d52a580bSJunchao Zhang /* copy values only */ 48*d52a580bSJunchao Zhang PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice)); 49*d52a580bSJunchao Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 50*d52a580bSJunchao Zhang } else { 51*d52a580bSJunchao Zhang if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx)); 52*d52a580bSJunchao Zhang if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val)); 53*d52a580bSJunchao Zhang if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx)); 54*d52a580bSJunchao Zhang if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map)); 55*d52a580bSJunchao Zhang hipstruct->maxallocmat = a->maxallocmat; 56*d52a580bSJunchao Zhang hipstruct->totalentries = a->sliidx[a->totalslices]; 57*d52a580bSJunchao Zhang hipstruct->totalslices = a->totalslices; 58*d52a580bSJunchao Zhang hipstruct->totalchunks = a->totalchunks; 59*d52a580bSJunchao Zhang PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx))); 60*d52a580bSJunchao Zhang PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val))); 61*d52a580bSJunchao Zhang /* copy values, nz or maxallocmat? */ 62*d52a580bSJunchao Zhang PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice)); 63*d52a580bSJunchao Zhang PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice)); 64*d52a580bSJunchao Zhang 65*d52a580bSJunchao Zhang PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx))); 66*d52a580bSJunchao Zhang PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice)); 67*d52a580bSJunchao Zhang PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map))); 68*d52a580bSJunchao Zhang PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice)); 69*d52a580bSJunchao Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt))); 70*d52a580bSJunchao Zhang } 71*d52a580bSJunchao Zhang PetscCallHIP(WaitForHIP()); 72*d52a580bSJunchao Zhang PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0)); 73*d52a580bSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 74*d52a580bSJunchao Zhang } 75*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 76*d52a580bSJunchao Zhang } 77*d52a580bSJunchao Zhang 78*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 79*d52a580bSJunchao Zhang { 80*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 81*d52a580bSJunchao Zhang MatScalar sum; 82*d52a580bSJunchao Zhang /* one thread per row. */ 83*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 84*d52a580bSJunchao Zhang if (row < nrows) { 85*d52a580bSJunchao Zhang slice_id = row / sliceheight; 86*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 87*d52a580bSJunchao Zhang sum = 0.0; 88*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 89*d52a580bSJunchao Zhang y[row] = sum; 90*d52a580bSJunchao Zhang } 91*d52a580bSJunchao Zhang } 92*d52a580bSJunchao Zhang 93*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 94*d52a580bSJunchao Zhang { 95*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 96*d52a580bSJunchao Zhang MatScalar sum; 97*d52a580bSJunchao Zhang /* one thread per row. */ 98*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 99*d52a580bSJunchao Zhang if (row < nrows) { 100*d52a580bSJunchao Zhang slice_id = row / sliceheight; 101*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 102*d52a580bSJunchao Zhang sum = 0.0; 103*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 104*d52a580bSJunchao Zhang z[row] = y[row] + sum; 105*d52a580bSJunchao Zhang } 106*d52a580bSJunchao Zhang } 107*d52a580bSJunchao Zhang 108*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX) 109*d52a580bSJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wpass-failed") 110*d52a580bSJunchao Zhang /* use 1 block per slice, suitable for large slice width */ 111*d52a580bSJunchao Zhang template <int BLOCKY> 112*d52a580bSJunchao Zhang __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 113*d52a580bSJunchao Zhang { 114*d52a580bSJunchao Zhang __shared__ MatScalar shared[WARP_SIZE][BLOCKY]; 115*d52a580bSJunchao Zhang PetscInt i, row, slice_id = blockIdx.x; 116*d52a580bSJunchao Zhang int tid = threadIdx.x + threadIdx.y * WARP_SIZE; 117*d52a580bSJunchao Zhang /* transposed index */ 118*d52a580bSJunchao Zhang int tidx = tid % BLOCKY; 119*d52a580bSJunchao Zhang int tidy = tid / BLOCKY; 120*d52a580bSJunchao Zhang PetscScalar t = 0.0; 121*d52a580bSJunchao Zhang 122*d52a580bSJunchao Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 123*d52a580bSJunchao Zhang if (row < nrows) { 124*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]]; 125*d52a580bSJunchao Zhang } 126*d52a580bSJunchao Zhang #pragma unroll 127*d52a580bSJunchao Zhang for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 128*d52a580bSJunchao Zhang /* transpose layout to reduce each row using warp shfl */ 129*d52a580bSJunchao Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 130*d52a580bSJunchao Zhang __syncthreads(); 131*d52a580bSJunchao Zhang if (tidy < sliceheight) t = shared[tidy][tidx]; 132*d52a580bSJunchao Zhang #pragma unroll 133*d52a580bSJunchao Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY); 134*d52a580bSJunchao Zhang if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; 135*d52a580bSJunchao Zhang __syncthreads(); 136*d52a580bSJunchao Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x]; 137*d52a580bSJunchao Zhang } 138*d52a580bSJunchao Zhang 139*d52a580bSJunchao Zhang /* use 1 block per slice, suitable for large slice width */ 140*d52a580bSJunchao Zhang template <int BLOCKY> 141*d52a580bSJunchao Zhang __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 142*d52a580bSJunchao Zhang { 143*d52a580bSJunchao Zhang __shared__ MatScalar shared[WARP_SIZE][BLOCKY]; 144*d52a580bSJunchao Zhang PetscInt i, row, slice_id = blockIdx.x; 145*d52a580bSJunchao Zhang int tid = threadIdx.x + threadIdx.y * WARP_SIZE; 146*d52a580bSJunchao Zhang /* transposed index */ 147*d52a580bSJunchao Zhang int tidx = tid % BLOCKY; 148*d52a580bSJunchao Zhang int tidy = tid / BLOCKY; 149*d52a580bSJunchao Zhang PetscScalar t = 0.0; 150*d52a580bSJunchao Zhang 151*d52a580bSJunchao Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 152*d52a580bSJunchao Zhang if (row < nrows) { 153*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]]; 154*d52a580bSJunchao Zhang } 155*d52a580bSJunchao Zhang #pragma unroll 156*d52a580bSJunchao Zhang for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 157*d52a580bSJunchao Zhang /* transpose layout to reduce each row using warp shfl */ 158*d52a580bSJunchao Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t; 159*d52a580bSJunchao Zhang __syncthreads(); 160*d52a580bSJunchao Zhang if (tidy < sliceheight) t = shared[tidy][tidx]; 161*d52a580bSJunchao Zhang #pragma unroll 162*d52a580bSJunchao Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY); 163*d52a580bSJunchao Zhang if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t; 164*d52a580bSJunchao Zhang __syncthreads(); 165*d52a580bSJunchao Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x]; 166*d52a580bSJunchao Zhang } 167*d52a580bSJunchao Zhang 168*d52a580bSJunchao Zhang template <int BLOCKY> 169*d52a580bSJunchao Zhang __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val) 170*d52a580bSJunchao Zhang { 171*d52a580bSJunchao Zhang bool head = true; 172*d52a580bSJunchao Zhang #pragma unroll 173*d52a580bSJunchao Zhang for (int i = 1; i < BLOCKY * 2; i <<= 1) { 174*d52a580bSJunchao Zhang int halfwarpid = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2); 175*d52a580bSJunchao Zhang shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0; 176*d52a580bSJunchao Zhang if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) { 177*d52a580bSJunchao Zhang shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val; 178*d52a580bSJunchao Zhang if (i == 1) head = false; 179*d52a580bSJunchao Zhang } 180*d52a580bSJunchao Zhang __syncthreads(); 181*d52a580bSJunchao Zhang if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_SIZE]; 182*d52a580bSJunchao Zhang __syncthreads(); 183*d52a580bSJunchao Zhang } 184*d52a580bSJunchao Zhang return head; 185*d52a580bSJunchao Zhang } 186*d52a580bSJunchao Zhang 187*d52a580bSJunchao Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */ 188*d52a580bSJunchao Zhang template <int BLOCKY> 189*d52a580bSJunchao Zhang __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 190*d52a580bSJunchao Zhang { 191*d52a580bSJunchao Zhang __shared__ MatScalar shared[BLOCKY * WARP_SIZE]; 192*d52a580bSJunchao Zhang PetscInt gid, row, start_slice, cid; 193*d52a580bSJunchao Zhang PetscScalar t = 0.0; 194*d52a580bSJunchao Zhang AtomicAdd<MatScalar> atomAdd; 195*d52a580bSJunchao Zhang /* zero out y */ 196*d52a580bSJunchao Zhang for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) { 197*d52a580bSJunchao Zhang gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; 198*d52a580bSJunchao Zhang if (gid < nrows) y[gid] = 0.0; 199*d52a580bSJunchao Zhang } 200*d52a580bSJunchao Zhang for (int iter = 0; iter < chunksperblock; iter++) { 201*d52a580bSJunchao Zhang cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 202*d52a580bSJunchao Zhang if (cid < totalchunks) { 203*d52a580bSJunchao Zhang start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 204*d52a580bSJunchao Zhang gid = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; 205*d52a580bSJunchao Zhang if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 206*d52a580bSJunchao Zhang __shared__ PetscInt flag[BLOCKY * 2]; 207*d52a580bSJunchao Zhang bool write; 208*d52a580bSJunchao Zhang PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices]; 209*d52a580bSJunchao Zhang /* find out the slice that this element belongs to */ 210*d52a580bSJunchao Zhang while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 211*d52a580bSJunchao Zhang if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id; 212*d52a580bSJunchao Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 213*d52a580bSJunchao Zhang if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 214*d52a580bSJunchao Zhang __syncthreads(); 215*d52a580bSJunchao Zhang write = segment_scan<BLOCKY>(flag, shared, &t); 216*d52a580bSJunchao Zhang if (row < nrows && gid < totalentries && write) atomAdd(y[row], t); 217*d52a580bSJunchao Zhang t = 0.0; 218*d52a580bSJunchao Zhang } else { /* this iteration covers only one slice */ 219*d52a580bSJunchao Zhang row = start_slice * sliceheight + threadIdx.x % sliceheight; 220*d52a580bSJunchao Zhang if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 221*d52a580bSJunchao Zhang if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 222*d52a580bSJunchao Zhang int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 223*d52a580bSJunchao Zhang /* reduction and write to output vector */ 224*d52a580bSJunchao Zhang #pragma unroll 225*d52a580bSJunchao Zhang for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 226*d52a580bSJunchao Zhang /* transpose layout to reduce each row using warp shfl */ 227*d52a580bSJunchao Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 228*d52a580bSJunchao Zhang __syncthreads(); 229*d52a580bSJunchao Zhang if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 230*d52a580bSJunchao Zhang #pragma unroll 231*d52a580bSJunchao Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY); 232*d52a580bSJunchao Zhang if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ 233*d52a580bSJunchao Zhang __syncthreads(); 234*d52a580bSJunchao Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 235*d52a580bSJunchao Zhang t = 0.0; 236*d52a580bSJunchao Zhang } 237*d52a580bSJunchao Zhang } 238*d52a580bSJunchao Zhang } 239*d52a580bSJunchao Zhang } 240*d52a580bSJunchao Zhang } 241*d52a580bSJunchao Zhang 242*d52a580bSJunchao Zhang /* load-balancing version. Chunksize is equal to the number of threads per block */ 243*d52a580bSJunchao Zhang template <int BLOCKY> 244*d52a580bSJunchao Zhang __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 245*d52a580bSJunchao Zhang { 246*d52a580bSJunchao Zhang __shared__ MatScalar shared[BLOCKY * WARP_SIZE]; 247*d52a580bSJunchao Zhang PetscInt gid, row, start_slice, cid; 248*d52a580bSJunchao Zhang PetscScalar t = 0.0; 249*d52a580bSJunchao Zhang AtomicAdd<MatScalar> atomAdd; 250*d52a580bSJunchao Zhang /* copy y to z */ 251*d52a580bSJunchao Zhang for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) { 252*d52a580bSJunchao Zhang gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; 253*d52a580bSJunchao Zhang if (gid < nrows) z[gid] = y[gid]; 254*d52a580bSJunchao Zhang } 255*d52a580bSJunchao Zhang for (int iter = 0; iter < chunksperblock; iter++) { 256*d52a580bSJunchao Zhang cid = blockIdx.x * chunksperblock + iter; /* chunk id */ 257*d52a580bSJunchao Zhang if (cid < totalchunks) { 258*d52a580bSJunchao Zhang start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */ 259*d52a580bSJunchao Zhang gid = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x; 260*d52a580bSJunchao Zhang if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */ 261*d52a580bSJunchao Zhang __shared__ PetscInt flag[BLOCKY * 2]; 262*d52a580bSJunchao Zhang bool write; 263*d52a580bSJunchao Zhang PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices]; 264*d52a580bSJunchao Zhang /* find out the slice that this element belongs to */ 265*d52a580bSJunchao Zhang while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++; 266*d52a580bSJunchao Zhang if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id; 267*d52a580bSJunchao Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 268*d52a580bSJunchao Zhang if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]]; 269*d52a580bSJunchao Zhang __syncthreads(); 270*d52a580bSJunchao Zhang write = segment_scan<BLOCKY>(flag, shared, &t); 271*d52a580bSJunchao Zhang if (row < nrows && gid < totalentries && write) atomAdd(z[row], t); 272*d52a580bSJunchao Zhang t = 0.0; 273*d52a580bSJunchao Zhang } else { /* this iteration covers only one slice */ 274*d52a580bSJunchao Zhang row = start_slice * sliceheight + threadIdx.x % sliceheight; 275*d52a580bSJunchao Zhang if (row < nrows) t += aval[gid] * x[acolidx[gid]]; 276*d52a580bSJunchao Zhang if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */ 277*d52a580bSJunchao Zhang int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY; 278*d52a580bSJunchao Zhang /* reduction and write to output vector */ 279*d52a580bSJunchao Zhang #pragma unroll 280*d52a580bSJunchao Zhang for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 281*d52a580bSJunchao Zhang /* transpose layout to reduce each row using warp shfl */ 282*d52a580bSJunchao Zhang if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */ 283*d52a580bSJunchao Zhang __syncthreads(); 284*d52a580bSJunchao Zhang if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */ 285*d52a580bSJunchao Zhang #pragma unroll 286*d52a580bSJunchao Zhang for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down(t, offset, BLOCKY); 287*d52a580bSJunchao Zhang if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */ 288*d52a580bSJunchao Zhang __syncthreads(); 289*d52a580bSJunchao Zhang if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */ 290*d52a580bSJunchao Zhang t = 0.0; 291*d52a580bSJunchao Zhang } 292*d52a580bSJunchao Zhang } 293*d52a580bSJunchao Zhang } 294*d52a580bSJunchao Zhang } 295*d52a580bSJunchao Zhang } 296*d52a580bSJunchao Zhang 297*d52a580bSJunchao Zhang /* use 1 warp per slice, suitable for small slice width */ 298*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 299*d52a580bSJunchao Zhang { 300*d52a580bSJunchao Zhang PetscInt i, row, slice_id; 301*d52a580bSJunchao Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 302*d52a580bSJunchao Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 303*d52a580bSJunchao Zhang double t = 0.0; 304*d52a580bSJunchao Zhang if (row < nrows) { 305*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]]; 306*d52a580bSJunchao Zhang } 307*d52a580bSJunchao Zhang #pragma unroll 308*d52a580bSJunchao Zhang for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 309*d52a580bSJunchao Zhang if (row < nrows && threadIdx.x < sliceheight) y[row] = t; 310*d52a580bSJunchao Zhang } 311*d52a580bSJunchao Zhang 312*d52a580bSJunchao Zhang /* use 1 warp per slice, suitable for small slice width */ 313*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 314*d52a580bSJunchao Zhang { 315*d52a580bSJunchao Zhang PetscInt i, row, slice_id; 316*d52a580bSJunchao Zhang slice_id = blockIdx.x * blockDim.y + threadIdx.y; 317*d52a580bSJunchao Zhang row = slice_id * sliceheight + threadIdx.x % sliceheight; 318*d52a580bSJunchao Zhang double t = 0.0; 319*d52a580bSJunchao Zhang if (row < nrows) { 320*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]]; 321*d52a580bSJunchao Zhang } 322*d52a580bSJunchao Zhang #pragma unroll 323*d52a580bSJunchao Zhang for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) t += __shfl_down(t, offset); 324*d52a580bSJunchao Zhang if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t; 325*d52a580bSJunchao Zhang } 326*d52a580bSJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 327*d52a580bSJunchao Zhang #endif 328*d52a580bSJunchao Zhang 329*d52a580bSJunchao Zhang /*********** Kernel 2-6 require a slice height smaller than 512, 256, 128, 64, 32, espectively. They are kept only for performance comparison **********/ 330*d52a580bSJunchao Zhang 331*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 332*d52a580bSJunchao Zhang { 333*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 334*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 335*d52a580bSJunchao Zhang /* multiple threads per row. */ 336*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 337*d52a580bSJunchao Zhang if (row < nrows) { 338*d52a580bSJunchao Zhang slice_id = row / sliceheight; 339*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 340*d52a580bSJunchao Zhang 341*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 342*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 343*d52a580bSJunchao Zhang __syncthreads(); 344*d52a580bSJunchao Zhang if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; 345*d52a580bSJunchao Zhang __syncthreads(); 346*d52a580bSJunchao Zhang if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 347*d52a580bSJunchao Zhang __syncthreads(); 348*d52a580bSJunchao Zhang if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 349*d52a580bSJunchao Zhang __syncthreads(); 350*d52a580bSJunchao Zhang if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 351*d52a580bSJunchao Zhang __syncthreads(); 352*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 353*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 354*d52a580bSJunchao Zhang y[row] = shared[threadIdx.x]; 355*d52a580bSJunchao Zhang } 356*d52a580bSJunchao Zhang } 357*d52a580bSJunchao Zhang } 358*d52a580bSJunchao Zhang 359*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 360*d52a580bSJunchao Zhang { 361*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 362*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 363*d52a580bSJunchao Zhang /* multiple threads per row. */ 364*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 365*d52a580bSJunchao Zhang if (row < nrows) { 366*d52a580bSJunchao Zhang slice_id = row / sliceheight; 367*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 368*d52a580bSJunchao Zhang 369*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 370*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 371*d52a580bSJunchao Zhang __syncthreads(); 372*d52a580bSJunchao Zhang if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 373*d52a580bSJunchao Zhang __syncthreads(); 374*d52a580bSJunchao Zhang if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 375*d52a580bSJunchao Zhang __syncthreads(); 376*d52a580bSJunchao Zhang if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 377*d52a580bSJunchao Zhang __syncthreads(); 378*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 379*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 380*d52a580bSJunchao Zhang y[row] = shared[threadIdx.x]; 381*d52a580bSJunchao Zhang } 382*d52a580bSJunchao Zhang } 383*d52a580bSJunchao Zhang } 384*d52a580bSJunchao Zhang 385*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 386*d52a580bSJunchao Zhang { 387*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 388*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 389*d52a580bSJunchao Zhang /* multiple threads per row. */ 390*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 391*d52a580bSJunchao Zhang if (row < nrows) { 392*d52a580bSJunchao Zhang slice_id = row / sliceheight; 393*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 394*d52a580bSJunchao Zhang 395*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 396*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 397*d52a580bSJunchao Zhang __syncthreads(); 398*d52a580bSJunchao Zhang if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 399*d52a580bSJunchao Zhang __syncthreads(); 400*d52a580bSJunchao Zhang if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 401*d52a580bSJunchao Zhang __syncthreads(); 402*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 403*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 404*d52a580bSJunchao Zhang y[row] = shared[threadIdx.x]; 405*d52a580bSJunchao Zhang } 406*d52a580bSJunchao Zhang } 407*d52a580bSJunchao Zhang } 408*d52a580bSJunchao Zhang 409*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 410*d52a580bSJunchao Zhang { 411*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 412*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 413*d52a580bSJunchao Zhang /* multiple threads per row. */ 414*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 415*d52a580bSJunchao Zhang if (row < nrows) { 416*d52a580bSJunchao Zhang slice_id = row / sliceheight; 417*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 418*d52a580bSJunchao Zhang 419*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 420*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 421*d52a580bSJunchao Zhang __syncthreads(); 422*d52a580bSJunchao Zhang if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 423*d52a580bSJunchao Zhang __syncthreads(); 424*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 425*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 426*d52a580bSJunchao Zhang y[row] = shared[threadIdx.x]; 427*d52a580bSJunchao Zhang } 428*d52a580bSJunchao Zhang } 429*d52a580bSJunchao Zhang } 430*d52a580bSJunchao Zhang 431*d52a580bSJunchao Zhang static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 432*d52a580bSJunchao Zhang { 433*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 434*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 435*d52a580bSJunchao Zhang /* multiple threads per row. */ 436*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 437*d52a580bSJunchao Zhang if (row < nrows) { 438*d52a580bSJunchao Zhang slice_id = row / sliceheight; 439*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 440*d52a580bSJunchao Zhang 441*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 442*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 443*d52a580bSJunchao Zhang __syncthreads(); 444*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 445*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 446*d52a580bSJunchao Zhang y[row] = shared[threadIdx.x]; 447*d52a580bSJunchao Zhang } 448*d52a580bSJunchao Zhang } 449*d52a580bSJunchao Zhang } 450*d52a580bSJunchao Zhang 451*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 452*d52a580bSJunchao Zhang { 453*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 454*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 455*d52a580bSJunchao Zhang /* multiple threads per row. */ 456*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 457*d52a580bSJunchao Zhang if (row < nrows) { 458*d52a580bSJunchao Zhang slice_id = row / sliceheight; 459*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 460*d52a580bSJunchao Zhang 461*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 462*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 463*d52a580bSJunchao Zhang __syncthreads(); 464*d52a580bSJunchao Zhang if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; 465*d52a580bSJunchao Zhang __syncthreads(); 466*d52a580bSJunchao Zhang if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 467*d52a580bSJunchao Zhang __syncthreads(); 468*d52a580bSJunchao Zhang if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 469*d52a580bSJunchao Zhang __syncthreads(); 470*d52a580bSJunchao Zhang if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 471*d52a580bSJunchao Zhang __syncthreads(); 472*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 473*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 474*d52a580bSJunchao Zhang z[row] = y[row] + shared[threadIdx.x]; 475*d52a580bSJunchao Zhang } 476*d52a580bSJunchao Zhang } 477*d52a580bSJunchao Zhang } 478*d52a580bSJunchao Zhang 479*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 480*d52a580bSJunchao Zhang { 481*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 482*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 483*d52a580bSJunchao Zhang /* multiple threads per row. */ 484*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 485*d52a580bSJunchao Zhang if (row < nrows) { 486*d52a580bSJunchao Zhang slice_id = row / sliceheight; 487*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 488*d52a580bSJunchao Zhang 489*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 490*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 491*d52a580bSJunchao Zhang __syncthreads(); 492*d52a580bSJunchao Zhang if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; 493*d52a580bSJunchao Zhang __syncthreads(); 494*d52a580bSJunchao Zhang if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 495*d52a580bSJunchao Zhang __syncthreads(); 496*d52a580bSJunchao Zhang if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 497*d52a580bSJunchao Zhang __syncthreads(); 498*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 499*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 500*d52a580bSJunchao Zhang z[row] = y[row] + shared[threadIdx.x]; 501*d52a580bSJunchao Zhang } 502*d52a580bSJunchao Zhang } 503*d52a580bSJunchao Zhang } 504*d52a580bSJunchao Zhang 505*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 506*d52a580bSJunchao Zhang { 507*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 508*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 509*d52a580bSJunchao Zhang /* multiple threads per row. */ 510*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 511*d52a580bSJunchao Zhang if (row < nrows) { 512*d52a580bSJunchao Zhang slice_id = row / sliceheight; 513*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 514*d52a580bSJunchao Zhang 515*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 516*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 517*d52a580bSJunchao Zhang __syncthreads(); 518*d52a580bSJunchao Zhang if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; 519*d52a580bSJunchao Zhang __syncthreads(); 520*d52a580bSJunchao Zhang if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 521*d52a580bSJunchao Zhang __syncthreads(); 522*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 523*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 524*d52a580bSJunchao Zhang z[row] = y[row] + shared[threadIdx.x]; 525*d52a580bSJunchao Zhang } 526*d52a580bSJunchao Zhang } 527*d52a580bSJunchao Zhang } 528*d52a580bSJunchao Zhang 529*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 530*d52a580bSJunchao Zhang { 531*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 532*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 533*d52a580bSJunchao Zhang /* multiple threads per row. */ 534*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 535*d52a580bSJunchao Zhang if (row < nrows) { 536*d52a580bSJunchao Zhang slice_id = row / sliceheight; 537*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 538*d52a580bSJunchao Zhang 539*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 540*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 541*d52a580bSJunchao Zhang __syncthreads(); 542*d52a580bSJunchao Zhang if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; 543*d52a580bSJunchao Zhang __syncthreads(); 544*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 545*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 546*d52a580bSJunchao Zhang z[row] = y[row] + shared[threadIdx.x]; 547*d52a580bSJunchao Zhang } 548*d52a580bSJunchao Zhang } 549*d52a580bSJunchao Zhang } 550*d52a580bSJunchao Zhang 551*d52a580bSJunchao Zhang static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 552*d52a580bSJunchao Zhang { 553*d52a580bSJunchao Zhang __shared__ MatScalar shared[32 * 16]; 554*d52a580bSJunchao Zhang PetscInt i, row, slice_id, row_in_slice; 555*d52a580bSJunchao Zhang /* multiple threads per row. */ 556*d52a580bSJunchao Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 557*d52a580bSJunchao Zhang if (row < nrows) { 558*d52a580bSJunchao Zhang slice_id = row / sliceheight; 559*d52a580bSJunchao Zhang row_in_slice = row % sliceheight; 560*d52a580bSJunchao Zhang 561*d52a580bSJunchao Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 562*d52a580bSJunchao Zhang for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 563*d52a580bSJunchao Zhang __syncthreads(); 564*d52a580bSJunchao Zhang if (threadIdx.y < 1) { 565*d52a580bSJunchao Zhang shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 566*d52a580bSJunchao Zhang z[row] = y[row] + shared[threadIdx.x]; 567*d52a580bSJunchao Zhang } 568*d52a580bSJunchao Zhang } 569*d52a580bSJunchao Zhang } 570*d52a580bSJunchao Zhang 571*d52a580bSJunchao Zhang static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy) 572*d52a580bSJunchao Zhang { 573*d52a580bSJunchao Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 574*d52a580bSJunchao Zhang Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 575*d52a580bSJunchao Zhang PetscScalar *y; 576*d52a580bSJunchao Zhang const PetscScalar *x; 577*d52a580bSJunchao Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 578*d52a580bSJunchao Zhang MatScalar *aval; 579*d52a580bSJunchao Zhang PetscInt *acolidx; 580*d52a580bSJunchao Zhang PetscInt *sliidx; 581*d52a580bSJunchao Zhang PetscInt nblocks, blocksize = 512; /* blocksize is fixed to be 512 */ 582*d52a580bSJunchao Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 583*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX) 584*d52a580bSJunchao Zhang PetscInt chunksperblock, nchunks, *chunk_slice_map; 585*d52a580bSJunchao Zhang PetscReal maxoveravg; 586*d52a580bSJunchao Zhang #endif 587*d52a580bSJunchao Zhang 588*d52a580bSJunchao Zhang PetscFunctionBegin; 589*d52a580bSJunchao Zhang PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 590*d52a580bSJunchao Zhang PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be less than 32, but the current slice height is %" PetscInt_FMT, sliceheight); 591*d52a580bSJunchao Zhang PetscCall(MatSeqSELLHIPCopyToGPU(A)); 592*d52a580bSJunchao Zhang /* hipstruct may not be available until MatSeqSELLHIPCopyToGPU() is called */ 593*d52a580bSJunchao Zhang aval = hipstruct->val; 594*d52a580bSJunchao Zhang acolidx = hipstruct->colidx; 595*d52a580bSJunchao Zhang sliidx = hipstruct->sliidx; 596*d52a580bSJunchao Zhang 597*d52a580bSJunchao Zhang PetscCall(VecHIPGetArrayRead(xx, &x)); 598*d52a580bSJunchao Zhang PetscCall(VecHIPGetArrayWrite(yy, &y)); 599*d52a580bSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 600*d52a580bSJunchao Zhang 601*d52a580bSJunchao Zhang switch (hipstruct->kernelchoice) { 602*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX) 603*d52a580bSJunchao Zhang case 9: /* 1 slice per block */ 604*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / sliceheight; 605*d52a580bSJunchao Zhang if (hipstruct->blocky == 2) { 606*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 607*d52a580bSJunchao Zhang } else if (hipstruct->blocky == 4) { 608*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 609*d52a580bSJunchao Zhang } else if (hipstruct->blocky == 8) { 610*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 611*d52a580bSJunchao Zhang } else if (hipstruct->blocky == 16) { 612*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 613*d52a580bSJunchao Zhang } else { 614*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 615*d52a580bSJunchao Zhang } 616*d52a580bSJunchao Zhang break; 617*d52a580bSJunchao Zhang case 7: /* each block handles blocky slices */ 618*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight); 619*d52a580bSJunchao Zhang if (hipstruct->blocky == 2) { 620*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 621*d52a580bSJunchao Zhang } else if (hipstruct->blocky == 4) { 622*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 623*d52a580bSJunchao Zhang } else if (hipstruct->blocky == 8) { 624*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 625*d52a580bSJunchao Zhang } else if (hipstruct->blocky == 16) { 626*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 627*d52a580bSJunchao Zhang } else { 628*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); 629*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 630*d52a580bSJunchao Zhang } 631*d52a580bSJunchao Zhang break; 632*d52a580bSJunchao Zhang #endif 633*d52a580bSJunchao Zhang case 6: 634*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=32 */ 635*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 636*d52a580bSJunchao Zhang break; 637*d52a580bSJunchao Zhang case 5: 638*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=32*/ 639*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 640*d52a580bSJunchao Zhang break; 641*d52a580bSJunchao Zhang case 4: 642*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=32 */ 643*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 644*d52a580bSJunchao Zhang break; 645*d52a580bSJunchao Zhang case 3: 646*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=32 */ 647*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 648*d52a580bSJunchao Zhang break; 649*d52a580bSJunchao Zhang case 2: /* 16 slices per block if sliceheight=32 */ 650*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 651*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 652*d52a580bSJunchao Zhang break; 653*d52a580bSJunchao Zhang case 1: /* 32 slices per block if sliceheight=32 */ 654*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / blocksize; 655*d52a580bSJunchao Zhang matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 656*d52a580bSJunchao Zhang break; 657*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX) 658*d52a580bSJunchao Zhang case 0: 659*d52a580bSJunchao Zhang maxoveravg = a->maxslicewidth / a->avgslicewidth; 660*d52a580bSJunchao Zhang if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 661*d52a580bSJunchao Zhang /* each block handles approximately one slice */ 662*d52a580bSJunchao Zhang PetscInt blocky = a->chunksize / 32; 663*d52a580bSJunchao Zhang nchunks = hipstruct->totalchunks; 664*d52a580bSJunchao Zhang chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize; 665*d52a580bSJunchao Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 666*d52a580bSJunchao Zhang chunk_slice_map = hipstruct->chunk_slice_map; 667*d52a580bSJunchao Zhang if (blocky == 2) { 668*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 669*d52a580bSJunchao Zhang } else if (blocky == 4) { 670*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 671*d52a580bSJunchao Zhang } else if (blocky == 8) { 672*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 673*d52a580bSJunchao Zhang } else if (blocky == 16) { 674*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 675*d52a580bSJunchao Zhang } else { 676*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y); 677*d52a580bSJunchao Zhang } 678*d52a580bSJunchao Zhang } else { 679*d52a580bSJunchao Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth; 680*d52a580bSJunchao Zhang if (avgslicesize <= 432) { 681*d52a580bSJunchao Zhang if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 682*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 683*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 684*d52a580bSJunchao Zhang } else { 685*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / sliceheight; 686*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 687*d52a580bSJunchao Zhang } 688*d52a580bSJunchao Zhang } else if (avgslicesize <= 2400) { 689*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / sliceheight; 690*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 691*d52a580bSJunchao Zhang } else { 692*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / sliceheight; 693*d52a580bSJunchao Zhang matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 694*d52a580bSJunchao Zhang } 695*d52a580bSJunchao Zhang } 696*d52a580bSJunchao Zhang break; 697*d52a580bSJunchao Zhang #endif 698*d52a580bSJunchao Zhang default: 699*d52a580bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice); 700*d52a580bSJunchao Zhang } 701*d52a580bSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 702*d52a580bSJunchao Zhang PetscCall(VecHIPRestoreArrayRead(xx, &x)); 703*d52a580bSJunchao Zhang PetscCall(VecHIPRestoreArrayWrite(yy, &y)); 704*d52a580bSJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 705*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 706*d52a580bSJunchao Zhang } 707*d52a580bSJunchao Zhang 708*d52a580bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz) 709*d52a580bSJunchao Zhang { 710*d52a580bSJunchao Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 711*d52a580bSJunchao Zhang Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 712*d52a580bSJunchao Zhang PetscScalar *z; 713*d52a580bSJunchao Zhang const PetscScalar *y, *x; 714*d52a580bSJunchao Zhang PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 715*d52a580bSJunchao Zhang MatScalar *aval = hipstruct->val; 716*d52a580bSJunchao Zhang PetscInt *acolidx = hipstruct->colidx; 717*d52a580bSJunchao Zhang PetscInt *sliidx = hipstruct->sliidx; 718*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX) 719*d52a580bSJunchao Zhang PetscReal maxoveravg; 720*d52a580bSJunchao Zhang PetscInt chunksperblock, nchunks, *chunk_slice_map; 721*d52a580bSJunchao Zhang PetscInt blocky = hipstruct->blocky; 722*d52a580bSJunchao Zhang #endif 723*d52a580bSJunchao Zhang 724*d52a580bSJunchao Zhang PetscFunctionBegin; 725*d52a580bSJunchao Zhang PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 726*d52a580bSJunchao Zhang PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight != sliceheight), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} requires the slice height of the matrix be 16, but the current slice height is %" PetscInt_FMT, sliceheight); 727*d52a580bSJunchao Zhang PetscCall(MatSeqSELLHIPCopyToGPU(A)); 728*d52a580bSJunchao Zhang if (a->nz) { 729*d52a580bSJunchao Zhang PetscInt nblocks, blocksize = 512; 730*d52a580bSJunchao Zhang dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 731*d52a580bSJunchao Zhang PetscCall(VecHIPGetArrayRead(xx, &x)); 732*d52a580bSJunchao Zhang PetscCall(VecHIPGetArrayRead(yy, &y)); 733*d52a580bSJunchao Zhang PetscCall(VecHIPGetArrayWrite(zz, &z)); 734*d52a580bSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 735*d52a580bSJunchao Zhang 736*d52a580bSJunchao Zhang switch (hipstruct->kernelchoice) { 737*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX) 738*d52a580bSJunchao Zhang case 9: 739*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / sliceheight; 740*d52a580bSJunchao Zhang if (blocky == 2) { 741*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 742*d52a580bSJunchao Zhang } else if (blocky == 4) { 743*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 744*d52a580bSJunchao Zhang } else if (blocky == 8) { 745*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 746*d52a580bSJunchao Zhang } else if (blocky == 16) { 747*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 748*d52a580bSJunchao Zhang } else { 749*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 750*d52a580bSJunchao Zhang } 751*d52a580bSJunchao Zhang break; 752*d52a580bSJunchao Zhang case 8: 753*d52a580bSJunchao Zhang /* each block handles approximately one slice */ 754*d52a580bSJunchao Zhang nchunks = hipstruct->totalchunks; 755*d52a580bSJunchao Zhang blocky = a->chunksize / 32; 756*d52a580bSJunchao Zhang chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize; 757*d52a580bSJunchao Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 758*d52a580bSJunchao Zhang chunk_slice_map = hipstruct->chunk_slice_map; 759*d52a580bSJunchao Zhang if (blocky == 2) { 760*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 761*d52a580bSJunchao Zhang } else if (blocky == 4) { 762*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 763*d52a580bSJunchao Zhang } else if (blocky == 8) { 764*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 765*d52a580bSJunchao Zhang } else if (blocky == 16) { 766*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 767*d52a580bSJunchao Zhang } else { 768*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 769*d52a580bSJunchao Zhang } 770*d52a580bSJunchao Zhang break; 771*d52a580bSJunchao Zhang case 7: 772*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocky * sliceheight); 773*d52a580bSJunchao Zhang if (blocky == 2) { 774*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 775*d52a580bSJunchao Zhang } else if (blocky == 4) { 776*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 777*d52a580bSJunchao Zhang } else if (blocky == 8) { 778*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 779*d52a580bSJunchao Zhang } else if (blocky == 16) { 780*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 781*d52a580bSJunchao Zhang } else { 782*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); 783*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 784*d52a580bSJunchao Zhang } 785*d52a580bSJunchao Zhang break; 786*d52a580bSJunchao Zhang #endif 787*d52a580bSJunchao Zhang case 6: 788*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 32); 789*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 790*d52a580bSJunchao Zhang break; 791*d52a580bSJunchao Zhang case 5: 792*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 16); 793*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 794*d52a580bSJunchao Zhang break; 795*d52a580bSJunchao Zhang case 4: 796*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 8); 797*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 798*d52a580bSJunchao Zhang break; 799*d52a580bSJunchao Zhang case 3: 800*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 4); 801*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 802*d52a580bSJunchao Zhang break; 803*d52a580bSJunchao Zhang case 2: 804*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (blocksize / 2); 805*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 806*d52a580bSJunchao Zhang break; 807*d52a580bSJunchao Zhang case 1: 808*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / blocksize; 809*d52a580bSJunchao Zhang matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 810*d52a580bSJunchao Zhang break; 811*d52a580bSJunchao Zhang #if !defined(PETSC_USE_COMPLEX) 812*d52a580bSJunchao Zhang case 0: 813*d52a580bSJunchao Zhang maxoveravg = a->maxslicewidth / a->avgslicewidth; 814*d52a580bSJunchao Zhang if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */ 815*d52a580bSJunchao Zhang /* each block handles approximately one slice */ 816*d52a580bSJunchao Zhang nchunks = hipstruct->totalchunks; 817*d52a580bSJunchao Zhang blocky = a->chunksize / 32; 818*d52a580bSJunchao Zhang chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize; 819*d52a580bSJunchao Zhang nblocks = 1 + (nchunks - 1) / chunksperblock; 820*d52a580bSJunchao Zhang chunk_slice_map = hipstruct->chunk_slice_map; 821*d52a580bSJunchao Zhang if (blocky == 2) { 822*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 823*d52a580bSJunchao Zhang } else if (blocky == 4) { 824*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 825*d52a580bSJunchao Zhang } else if (blocky == 8) { 826*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 827*d52a580bSJunchao Zhang } else if (blocky == 16) { 828*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 829*d52a580bSJunchao Zhang } else { 830*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z); 831*d52a580bSJunchao Zhang } 832*d52a580bSJunchao Zhang } else { 833*d52a580bSJunchao Zhang PetscInt avgslicesize = sliceheight * a->avgslicewidth; 834*d52a580bSJunchao Zhang if (avgslicesize <= 432) { 835*d52a580bSJunchao Zhang if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) { 836*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 837*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 838*d52a580bSJunchao Zhang } else { 839*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / sliceheight; 840*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 841*d52a580bSJunchao Zhang } 842*d52a580bSJunchao Zhang } else if (avgslicesize <= 2400) { 843*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / sliceheight; 844*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 845*d52a580bSJunchao Zhang } else { 846*d52a580bSJunchao Zhang nblocks = 1 + (nrows - 1) / sliceheight; 847*d52a580bSJunchao Zhang matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 848*d52a580bSJunchao Zhang } 849*d52a580bSJunchao Zhang } 850*d52a580bSJunchao Zhang break; 851*d52a580bSJunchao Zhang #endif 852*d52a580bSJunchao Zhang default: 853*d52a580bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice); 854*d52a580bSJunchao Zhang } 855*d52a580bSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 856*d52a580bSJunchao Zhang PetscCall(VecHIPRestoreArrayRead(xx, &x)); 857*d52a580bSJunchao Zhang PetscCall(VecHIPRestoreArrayRead(yy, &y)); 858*d52a580bSJunchao Zhang PetscCall(VecHIPRestoreArrayWrite(zz, &z)); 859*d52a580bSJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 860*d52a580bSJunchao Zhang } else { 861*d52a580bSJunchao Zhang PetscCall(VecCopy(yy, zz)); 862*d52a580bSJunchao Zhang } 863*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 864*d52a580bSJunchao Zhang } 865*d52a580bSJunchao Zhang 866*d52a580bSJunchao Zhang static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems PetscOptionsObject) 867*d52a580bSJunchao Zhang { 868*d52a580bSJunchao Zhang Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 869*d52a580bSJunchao Zhang PetscInt kernel, blocky; 870*d52a580bSJunchao Zhang PetscBool flg; 871*d52a580bSJunchao Zhang 872*d52a580bSJunchao Zhang PetscFunctionBegin; 873*d52a580bSJunchao Zhang PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options"); 874*d52a580bSJunchao Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg)); 875*d52a580bSJunchao Zhang if (flg) { 876*d52a580bSJunchao Zhang PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky); 877*d52a580bSJunchao Zhang hipstruct->blocky = blocky; 878*d52a580bSJunchao Zhang } 879*d52a580bSJunchao Zhang PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg)); 880*d52a580bSJunchao Zhang if (flg) { 881*d52a580bSJunchao Zhang PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 882*d52a580bSJunchao Zhang hipstruct->kernelchoice = kernel; 883*d52a580bSJunchao Zhang if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg)); 884*d52a580bSJunchao Zhang } 885*d52a580bSJunchao Zhang PetscOptionsHeadEnd(); 886*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 887*d52a580bSJunchao Zhang } 888*d52a580bSJunchao Zhang 889*d52a580bSJunchao Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 890*d52a580bSJunchao Zhang { 891*d52a580bSJunchao Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 892*d52a580bSJunchao Zhang 893*d52a580bSJunchao Zhang PetscFunctionBegin; 894*d52a580bSJunchao Zhang PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 895*d52a580bSJunchao Zhang PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 896*d52a580bSJunchao Zhang PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 897*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 898*d52a580bSJunchao Zhang } 899*d52a580bSJunchao Zhang 900*d52a580bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode) 901*d52a580bSJunchao Zhang { 902*d52a580bSJunchao Zhang PetscFunctionBegin; 903*d52a580bSJunchao Zhang PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 904*d52a580bSJunchao Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 905*d52a580bSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 906*d52a580bSJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLHIPCopyToGPU(A)); 907*d52a580bSJunchao Zhang A->ops->mult = MatMult_SeqSELLHIP; 908*d52a580bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqSELLHIP; 909*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 910*d52a580bSJunchao Zhang } 911*d52a580bSJunchao Zhang 912*d52a580bSJunchao Zhang static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A) 913*d52a580bSJunchao Zhang { 914*d52a580bSJunchao Zhang PetscBool both = PETSC_FALSE; 915*d52a580bSJunchao Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 916*d52a580bSJunchao Zhang 917*d52a580bSJunchao Zhang PetscFunctionBegin; 918*d52a580bSJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 919*d52a580bSJunchao Zhang Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr; 920*d52a580bSJunchao Zhang if (hipstruct->val) { 921*d52a580bSJunchao Zhang both = PETSC_TRUE; 922*d52a580bSJunchao Zhang PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val))); 923*d52a580bSJunchao Zhang } 924*d52a580bSJunchao Zhang } 925*d52a580bSJunchao Zhang PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 926*d52a580bSJunchao Zhang if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 927*d52a580bSJunchao Zhang else A->offloadmask = PETSC_OFFLOAD_CPU; 928*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 929*d52a580bSJunchao Zhang } 930*d52a580bSJunchao Zhang 931*d52a580bSJunchao Zhang static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A) 932*d52a580bSJunchao Zhang { 933*d52a580bSJunchao Zhang PetscFunctionBegin; 934*d52a580bSJunchao Zhang if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr)); 935*d52a580bSJunchao Zhang PetscCall(MatDestroy_SeqSELL(A)); 936*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 937*d52a580bSJunchao Zhang } 938*d52a580bSJunchao Zhang 939*d52a580bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat); 940*d52a580bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B) 941*d52a580bSJunchao Zhang { 942*d52a580bSJunchao Zhang PetscFunctionBegin; 943*d52a580bSJunchao Zhang PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 944*d52a580bSJunchao Zhang PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B)); 945*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 946*d52a580bSJunchao Zhang } 947*d52a580bSJunchao Zhang 948*d52a580bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B) 949*d52a580bSJunchao Zhang { 950*d52a580bSJunchao Zhang Mat_SeqSELLHIP *hipstruct; 951*d52a580bSJunchao Zhang 952*d52a580bSJunchao Zhang PetscFunctionBegin; 953*d52a580bSJunchao Zhang PetscCall(PetscFree(B->defaultvectype)); 954*d52a580bSJunchao Zhang PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype)); 955*d52a580bSJunchao Zhang 956*d52a580bSJunchao Zhang if (!B->spptr) { 957*d52a580bSJunchao Zhang if (B->factortype == MAT_FACTOR_NONE) { 958*d52a580bSJunchao Zhang PetscCall(PetscNew(&hipstruct)); 959*d52a580bSJunchao Zhang B->spptr = hipstruct; 960*d52a580bSJunchao Zhang } 961*d52a580bSJunchao Zhang } 962*d52a580bSJunchao Zhang 963*d52a580bSJunchao Zhang B->ops->assemblyend = MatAssemblyEnd_SeqSELLHIP; 964*d52a580bSJunchao Zhang B->ops->destroy = MatDestroy_SeqSELLHIP; 965*d52a580bSJunchao Zhang B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP; 966*d52a580bSJunchao Zhang B->ops->mult = MatMult_SeqSELLHIP; 967*d52a580bSJunchao Zhang B->ops->multadd = MatMultAdd_SeqSELLHIP; 968*d52a580bSJunchao Zhang B->ops->duplicate = MatDuplicate_SeqSELLHIP; 969*d52a580bSJunchao Zhang B->ops->zeroentries = MatZeroEntries_SeqSELLHIP; 970*d52a580bSJunchao Zhang 971*d52a580bSJunchao Zhang /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 972*d52a580bSJunchao Zhang PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 973*d52a580bSJunchao Zhang 974*d52a580bSJunchao Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP)); 975*d52a580bSJunchao Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 976*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 977*d52a580bSJunchao Zhang } 978*d52a580bSJunchao Zhang 979*d52a580bSJunchao Zhang /*MC 980*d52a580bSJunchao Zhang MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs. 981*d52a580bSJunchao Zhang 982*d52a580bSJunchao Zhang Options Database Keys: 983*d52a580bSJunchao Zhang + -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()` 984*d52a580bSJunchao Zhang . -mat_sell_spmv_hip_kernel - selects a spmv kernel for MatSELLHIP 985*d52a580bSJunchao Zhang - -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the spmv kernels. These kernels use a 2D block with the x dimension equal to the wrap size (normally 64 for AMD GPUs) 986*d52a580bSJunchao Zhang 987*d52a580bSJunchao Zhang Level: beginner 988*d52a580bSJunchao Zhang 989*d52a580bSJunchao Zhang .seealso: [](ch_matrices), `Mat`, `MATSELLHIP` 990*d52a580bSJunchao Zhang M*/ 991*d52a580bSJunchao Zhang 992*d52a580bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B) 993*d52a580bSJunchao Zhang { 994*d52a580bSJunchao Zhang PetscFunctionBegin; 995*d52a580bSJunchao Zhang PetscCall(MatCreate_SeqSELL(B)); 996*d52a580bSJunchao Zhang PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B)); 997*d52a580bSJunchao Zhang PetscCall(MatSetFromOptions(B)); 998*d52a580bSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 999*d52a580bSJunchao Zhang } 1000