1 #include <cuda_runtime.h> 2 3 #include <petscdevice_cuda.h> 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 6 #define SLICE_HEIGHT 16 7 8 typedef struct { 9 PetscInt *colidx; /* column index */ 10 MatScalar *val; 11 PetscInt *sliidx; 12 PetscInt nonzerostate; 13 PetscInt kernelchoice; 14 PetscInt blocky; 15 } Mat_SeqSELLCUDA; 16 17 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 18 { 19 PetscFunctionBegin; 20 if (*cudastruct) { 21 if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 22 if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 23 if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 24 PetscCall(PetscFree(*cudastruct)); 25 } 26 PetscFunctionReturn(PETSC_SUCCESS); 27 } 28 29 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 30 { 31 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 32 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 33 34 PetscFunctionBegin; 35 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 36 PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 37 if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 38 /* copy values only */ 39 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 40 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 41 } else { 42 if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 43 if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 44 if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 45 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 46 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 47 /* copy values, nz or maxallocmat? */ 48 PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 49 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 50 51 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 52 PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 53 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt))); 54 cudastruct->nonzerostate = A->nonzerostate; 55 } 56 PetscCallCUDA(WaitForCUDA()); 57 PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 58 A->offloadmask = PETSC_OFFLOAD_BOTH; 59 } 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 64 { 65 PetscInt i, row, slice_id, row_in_slice; 66 MatScalar sum; 67 /* one thread per row. */ 68 row = blockIdx.x * blockDim.x + threadIdx.x; 69 if (row < nrows) { 70 slice_id = row / sliceheight; 71 row_in_slice = row % sliceheight; 72 sum = 0.0; 73 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 74 y[row] = sum; 75 } 76 } 77 78 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 79 { 80 PetscInt i, row, slice_id, row_in_slice; 81 MatScalar sum; 82 /* one thread per row. */ 83 row = blockIdx.x * blockDim.x + threadIdx.x; 84 if (row < nrows) { 85 slice_id = row / sliceheight; 86 row_in_slice = row % sliceheight; 87 sum = 0.0; 88 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]]; 89 z[row] = y[row] + sum; 90 } 91 } 92 93 /* use 1 block per slice, suitable for large slice width */ 94 template <int BLOCKY> 95 __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 96 { 97 __shared__ MatScalar shared[32][BLOCKY]; 98 PetscInt i, row, slice_id = blockIdx.x; 99 int tid = threadIdx.x + threadIdx.y * 32; 100 /* transposed index */ 101 int tidx = tid % BLOCKY; 102 int tidy = tid / BLOCKY; 103 PetscScalar t = 0.0; 104 105 row = slice_id * sliceheight + threadIdx.x % sliceheight; 106 if (row < nrows) { 107 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 108 } 109 #pragma unroll 110 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 111 /* transpose layout to reduce each row using warp shfl */ 112 shared[threadIdx.x][threadIdx.y] = t; 113 __syncthreads(); 114 t = shared[tidy][tidx]; 115 #pragma unroll 116 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 117 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 118 __syncthreads(); 119 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; } 120 } 121 122 /* use 1 block per slice, suitable for large slice width */ 123 template <int BLOCKY> 124 __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 125 { 126 __shared__ MatScalar shared[32][BLOCKY]; 127 PetscInt i, row, slice_id = blockIdx.x; 128 int tid = threadIdx.x + threadIdx.y * 32; 129 /* transposed index */ 130 int tidx = tid % BLOCKY; 131 int tidy = tid / BLOCKY; 132 PetscScalar t = 0.0; 133 134 row = slice_id * sliceheight + threadIdx.x % sliceheight; 135 if (row < nrows) { 136 for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]]; 137 } 138 #pragma unroll 139 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 140 /* transpose layout to reduce each row using warp shfl */ 141 shared[threadIdx.x][threadIdx.y] = t; 142 __syncthreads(); 143 t = shared[tidy][tidx]; 144 #pragma unroll 145 for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); } 146 if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; } 147 __syncthreads(); 148 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; } 149 } 150 151 /* use 1 warp per slice, suitable for small slice width */ 152 __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 153 { 154 PetscInt i, row, slice_id; 155 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 156 row = slice_id * sliceheight + threadIdx.x % sliceheight; 157 double t = 0.0; 158 if (row < nrows) { 159 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 160 } 161 #pragma unroll 162 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 163 if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; } 164 } 165 166 /* use 1 warp per slice, suitable for small slice width */ 167 __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 168 { 169 PetscInt i, row, slice_id; 170 slice_id = blockIdx.x * blockDim.y + threadIdx.y; 171 row = slice_id * sliceheight + threadIdx.x % sliceheight; 172 double t = 0.0; 173 if (row < nrows) { 174 for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]]; 175 } 176 #pragma unroll 177 for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); } 178 if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; } 179 } 180 181 /*********** Kernel 2-6 are tied to slice height 16. They are kept only for performance comparison **********/ 182 183 __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 184 { 185 __shared__ MatScalar shared[512]; 186 PetscInt i, row, slice_id, row_in_slice; 187 /* multiple threads per row. */ 188 row = blockIdx.x * blockDim.x + threadIdx.x; 189 if (row < nrows) { 190 slice_id = row / SLICE_HEIGHT; 191 row_in_slice = row % SLICE_HEIGHT; 192 193 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 194 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 195 __syncthreads(); 196 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 197 __syncthreads(); 198 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 199 __syncthreads(); 200 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 201 __syncthreads(); 202 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 203 __syncthreads(); 204 if (threadIdx.y < 1) { 205 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 206 y[row] = shared[threadIdx.x]; 207 } 208 } 209 } 210 211 __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 212 { 213 __shared__ MatScalar shared[512]; 214 PetscInt i, row, slice_id, row_in_slice; 215 /* multiple threads per row. */ 216 row = blockIdx.x * blockDim.x + threadIdx.x; 217 if (row < nrows) { 218 slice_id = row / SLICE_HEIGHT; 219 row_in_slice = row % SLICE_HEIGHT; 220 221 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 222 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 223 __syncthreads(); 224 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 225 __syncthreads(); 226 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 227 __syncthreads(); 228 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 229 __syncthreads(); 230 if (threadIdx.y < 1) { 231 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 232 y[row] = shared[threadIdx.x]; 233 } 234 } 235 } 236 237 __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 238 { 239 __shared__ MatScalar shared[512]; 240 PetscInt i, row, slice_id, row_in_slice; 241 /* multiple threads per row. */ 242 row = blockIdx.x * blockDim.x + threadIdx.x; 243 if (row < nrows) { 244 slice_id = row / SLICE_HEIGHT; 245 row_in_slice = row % SLICE_HEIGHT; 246 247 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 248 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 249 __syncthreads(); 250 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 251 __syncthreads(); 252 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 253 __syncthreads(); 254 if (threadIdx.y < 1) { 255 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 256 y[row] = shared[threadIdx.x]; 257 } 258 } 259 } 260 261 __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 262 { 263 __shared__ MatScalar shared[512]; 264 PetscInt i, row, slice_id, row_in_slice; 265 /* multiple threads per row. */ 266 row = blockIdx.x * blockDim.x + threadIdx.x; 267 if (row < nrows) { 268 slice_id = row / SLICE_HEIGHT; 269 row_in_slice = row % SLICE_HEIGHT; 270 271 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 272 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 273 __syncthreads(); 274 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 275 __syncthreads(); 276 if (threadIdx.y < 1) { 277 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 278 y[row] = shared[threadIdx.x]; 279 } 280 } 281 } 282 283 __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 284 { 285 __shared__ MatScalar shared[512]; 286 PetscInt i, row, slice_id, row_in_slice; 287 /* multiple threads per row. */ 288 row = blockIdx.x * blockDim.x + threadIdx.x; 289 if (row < nrows) { 290 slice_id = row / SLICE_HEIGHT; 291 row_in_slice = row % SLICE_HEIGHT; 292 293 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 294 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 295 __syncthreads(); 296 if (threadIdx.y < 1) { 297 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 298 y[row] = shared[threadIdx.x]; 299 } 300 } 301 } 302 303 __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 304 { 305 __shared__ MatScalar shared[512]; 306 PetscInt i, row, slice_id, row_in_slice; 307 /* multiple threads per row. */ 308 row = blockIdx.x * blockDim.x + threadIdx.x; 309 if (row < nrows) { 310 slice_id = row / SLICE_HEIGHT; 311 row_in_slice = row % SLICE_HEIGHT; 312 313 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 314 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 315 __syncthreads(); 316 if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; } 317 __syncthreads(); 318 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 319 __syncthreads(); 320 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 321 __syncthreads(); 322 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 323 __syncthreads(); 324 if (threadIdx.y < 1) { 325 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 326 z[row] = y[row] + shared[threadIdx.x]; 327 } 328 } 329 } 330 331 __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 332 { 333 __shared__ MatScalar shared[512]; 334 PetscInt i, row, slice_id, row_in_slice; 335 /* multiple threads per row. */ 336 row = blockIdx.x * blockDim.x + threadIdx.x; 337 if (row < nrows) { 338 slice_id = row / SLICE_HEIGHT; 339 row_in_slice = row % SLICE_HEIGHT; 340 341 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 342 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 343 __syncthreads(); 344 if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; } 345 __syncthreads(); 346 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 347 __syncthreads(); 348 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 349 __syncthreads(); 350 if (threadIdx.y < 1) { 351 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 352 z[row] = y[row] + shared[threadIdx.x]; 353 } 354 } 355 } 356 357 __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 358 { 359 __shared__ MatScalar shared[512]; 360 PetscInt i, row, slice_id, row_in_slice; 361 /* multiple threads per row. */ 362 row = blockIdx.x * blockDim.x + threadIdx.x; 363 if (row < nrows) { 364 slice_id = row / SLICE_HEIGHT; 365 row_in_slice = row % SLICE_HEIGHT; 366 367 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 368 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 369 __syncthreads(); 370 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 371 __syncthreads(); 372 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 373 __syncthreads(); 374 if (threadIdx.y < 1) { 375 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 376 z[row] = y[row] + shared[threadIdx.x]; 377 } 378 } 379 } 380 381 __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 382 { 383 __shared__ MatScalar shared[512]; 384 PetscInt i, row, slice_id, row_in_slice; 385 /* multiple threads per row. */ 386 row = blockIdx.x * blockDim.x + threadIdx.x; 387 if (row < nrows) { 388 slice_id = row / SLICE_HEIGHT; 389 row_in_slice = row % SLICE_HEIGHT; 390 391 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 392 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 393 __syncthreads(); 394 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 395 __syncthreads(); 396 if (threadIdx.y < 1) { 397 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 398 z[row] = y[row] + shared[threadIdx.x]; 399 } 400 } 401 } 402 403 __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 404 { 405 __shared__ MatScalar shared[512]; 406 PetscInt i, row, slice_id, row_in_slice; 407 /* multiple threads per row. */ 408 row = blockIdx.x * blockDim.x + threadIdx.x; 409 if (row < nrows) { 410 slice_id = row / SLICE_HEIGHT; 411 row_in_slice = row % SLICE_HEIGHT; 412 413 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 414 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 415 __syncthreads(); 416 if (threadIdx.y < 1) { 417 shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; 418 z[row] = y[row] + shared[threadIdx.x]; 419 } 420 } 421 } 422 423 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 424 { 425 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 426 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 427 PetscScalar *y; 428 const PetscScalar *x; 429 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 430 MatScalar *aval; 431 PetscInt *acolidx; 432 PetscInt *sliidx; 433 PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */ 434 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 435 436 PetscFunctionBegin; 437 PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 438 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 439 /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 440 aval = cudastruct->val; 441 acolidx = cudastruct->colidx; 442 sliidx = cudastruct->sliidx; 443 444 PetscCall(VecCUDAGetArrayRead(xx, &x)); 445 PetscCall(VecCUDAGetArrayWrite(yy, &y)); 446 PetscCall(PetscLogGpuTimeBegin()); 447 448 switch (cudastruct->kernelchoice) { 449 case 9: 450 nblocks = 1 + (nrows - 1) / sliceheight; 451 if (cudastruct->blocky == 2) { 452 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 453 } else if (cudastruct->blocky == 4) { 454 matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 455 } else if (cudastruct->blocky == 8) { 456 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 457 } else if (cudastruct->blocky == 16) { 458 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 459 } else if (cudastruct->blocky == 32) { 460 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 461 } else { 462 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 463 } 464 break; 465 case 7: 466 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 467 if (cudastruct->blocky == 2) { 468 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 469 } else if (cudastruct->blocky == 4) { 470 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 471 } else if (cudastruct->blocky == 8) { 472 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 473 } else if (cudastruct->blocky == 16) { 474 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 475 } else if (cudastruct->blocky == 32) { 476 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 477 } else { 478 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 479 } 480 break; 481 case 6: 482 nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */ 483 matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y); 484 break; 485 case 5: 486 nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/ 487 matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y); 488 break; 489 case 4: 490 nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */ 491 matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y); 492 break; 493 case 3: 494 nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */ 495 matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y); 496 break; 497 case 2: /* 16 slices per block if blocksize=512 */ 498 nblocks = 1 + (nrows - 1) / (blocksize / 2); 499 matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y); 500 break; 501 case 1: /* 32 slices per block if blocksize=512 */ 502 nblocks = 1 + (nrows - 1) / blocksize; 503 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 504 break; 505 case 0: 506 if (sliceheight * a->maxslicewidth > 20800) { /* important threshold */ 507 nblocks = 1 + (nrows - 1) / sliceheight; 508 matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 509 } else { 510 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 511 if (avgslicesize <= 96) { 512 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 513 matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 514 } else if (avgslicesize <= 432) { 515 nblocks = 1 + (nrows - 1) / sliceheight; 516 matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 517 } else if (avgslicesize <= 2400) { 518 nblocks = 1 + (nrows - 1) / sliceheight; 519 matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 520 } else { 521 nblocks = 1 + (nrows - 1) / sliceheight; 522 matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y); 523 } 524 } 525 break; 526 } 527 PetscCall(PetscLogGpuTimeEnd()); 528 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 529 PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 530 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 531 PetscFunctionReturn(PETSC_SUCCESS); 532 } 533 534 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 535 { 536 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 537 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 538 PetscScalar *z; 539 const PetscScalar *y, *x; 540 PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight; 541 MatScalar *aval = cudastruct->val; 542 PetscInt *acolidx = cudastruct->colidx; 543 PetscInt *sliidx = cudastruct->sliidx; 544 545 PetscFunctionBegin; 546 PetscCheck(sliceheight == 16, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 16, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight); 547 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 548 if (a->nz) { 549 PetscInt nblocks, blocksize = 512; 550 dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32); 551 PetscCall(VecCUDAGetArrayRead(xx, &x)); 552 PetscCall(VecCUDAGetArrayRead(yy, &y)); 553 PetscCall(VecCUDAGetArrayWrite(zz, &z)); 554 PetscCall(PetscLogGpuTimeBegin()); 555 556 switch (cudastruct->kernelchoice) { 557 case 9: 558 nblocks = 1 + (nrows - 1) / sliceheight; 559 if (cudastruct->blocky == 2) { 560 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 561 } else if (cudastruct->blocky == 4) { 562 matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 563 } else if (cudastruct->blocky == 8) { 564 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 565 } else if (cudastruct->blocky == 16) { 566 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 567 } else if (cudastruct->blocky == 32) { 568 matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 569 } else { 570 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 571 } 572 break; 573 case 7: 574 nblocks = 1 + (nrows - 1) / (2 * sliceheight); 575 if (cudastruct->blocky == 2) { 576 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 577 } else if (cudastruct->blocky == 4) { 578 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 579 } else if (cudastruct->blocky == 8) { 580 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 581 } else if (cudastruct->blocky == 16) { 582 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 583 } else if (cudastruct->blocky == 32) { 584 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 585 } else { 586 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 587 } 588 break; 589 case 6: 590 nblocks = 1 + (nrows - 1) / (blocksize / 32); 591 matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z); 592 break; 593 case 5: 594 nblocks = 1 + (nrows - 1) / (blocksize / 16); 595 matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z); 596 break; 597 case 4: 598 nblocks = 1 + (nrows - 1) / (blocksize / 8); 599 matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z); 600 break; 601 case 3: 602 nblocks = 1 + (nrows - 1) / (blocksize / 4); 603 matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z); 604 break; 605 case 2: 606 nblocks = 1 + (nrows - 1) / (blocksize / 2); 607 matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z); 608 break; 609 case 1: 610 nblocks = 1 + (nrows - 1) / blocksize; 611 matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 612 break; 613 case 0: 614 if (sliceheight * a->maxslicewidth > 20800) { 615 nblocks = 1 + (nrows - 1) / sliceheight; 616 matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 617 } else { 618 PetscInt avgslicesize = sliceheight * a->avgslicewidth; 619 if (avgslicesize <= 96) { 620 nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */ 621 matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 622 } else if (avgslicesize <= 432) { 623 nblocks = 1 + (nrows - 1) / sliceheight; 624 matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 625 } else if (avgslicesize <= 2400) { 626 nblocks = 1 + (nrows - 1) / sliceheight; 627 matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 628 } else { 629 nblocks = 1 + (nrows - 1) / sliceheight; 630 matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z); 631 } 632 } 633 break; 634 } 635 PetscCall(PetscLogGpuTimeEnd()); 636 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 637 PetscCall(VecCUDARestoreArrayRead(yy, &y)); 638 PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 639 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 640 } else { 641 PetscCall(VecCopy(yy, zz)); 642 } 643 PetscFunctionReturn(PETSC_SUCCESS); 644 } 645 646 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 647 { 648 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 649 PetscInt kernel, blocky; 650 PetscBool flg; 651 652 PetscFunctionBegin; 653 PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 654 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg)); 655 if (flg) { 656 PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel); 657 cudastruct->kernelchoice = kernel; 658 } 659 PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg)); 660 if (flg) { 661 PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", kernel); 662 cudastruct->blocky = blocky; 663 } 664 PetscOptionsHeadEnd(); 665 PetscFunctionReturn(PETSC_SUCCESS); 666 } 667 668 PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A) 669 { 670 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 671 672 PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth)); 673 PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth)); 674 PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio)); 675 PetscFunctionReturn(PETSC_SUCCESS); 676 } 677 678 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 679 { 680 PetscFunctionBegin; 681 PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 682 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A)); 683 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 684 if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 685 A->ops->mult = MatMult_SeqSELLCUDA; 686 A->ops->multadd = MatMultAdd_SeqSELLCUDA; 687 PetscFunctionReturn(PETSC_SUCCESS); 688 } 689 690 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 691 { 692 PetscFunctionBegin; 693 if (A->factortype == MAT_FACTOR_NONE) { 694 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 695 } 696 PetscCall(MatDestroy_SeqSELL(A)); 697 PetscFunctionReturn(PETSC_SUCCESS); 698 } 699 700 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 701 static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 702 { 703 PetscFunctionBegin; 704 PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 705 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 706 PetscFunctionReturn(PETSC_SUCCESS); 707 } 708 709 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 710 { 711 Mat_SeqSELLCUDA *cudastruct; 712 713 PetscFunctionBegin; 714 PetscCall(PetscFree(B->defaultvectype)); 715 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 716 717 if (!B->spptr) { 718 if (B->factortype == MAT_FACTOR_NONE) { 719 PetscCall(PetscNew(&cudastruct)); 720 B->spptr = cudastruct; 721 } 722 } 723 724 B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 725 B->ops->destroy = MatDestroy_SeqSELLCUDA; 726 B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 727 B->ops->mult = MatMult_SeqSELLCUDA; 728 B->ops->multadd = MatMultAdd_SeqSELLCUDA; 729 B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 730 731 /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */ 732 PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B)); 733 734 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 735 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 736 PetscFunctionReturn(PETSC_SUCCESS); 737 } 738 739 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 740 { 741 PetscFunctionBegin; 742 PetscCall(MatCreate_SeqSELL(B)); 743 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 744 PetscFunctionReturn(PETSC_SUCCESS); 745 } 746