1 #include <cuda_runtime.h> 2 3 #include <petscdevice_cuda.h> 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 6 typedef struct { 7 PetscInt *colidx; /* column index */ 8 MatScalar *val; 9 PetscInt *sliidx; 10 PetscInt nonzerostate; 11 } Mat_SeqSELLCUDA; 12 13 static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 14 { 15 PetscFunctionBegin; 16 if (*cudastruct) { 17 if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 18 if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 19 if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 20 PetscCall(PetscFree(*cudastruct)); 21 } 22 PetscFunctionReturn(PETSC_SUCCESS); 23 } 24 25 static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 26 { 27 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 28 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 29 30 PetscFunctionBegin; 31 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 32 PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 33 if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 34 /* copy values only */ 35 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 36 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 37 } else { 38 if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 39 if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 40 if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 41 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 42 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 43 PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 44 /* copy values, nz or maxallocmat? */ 45 PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 46 PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 47 PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 48 PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt))); 49 cudastruct->nonzerostate = A->nonzerostate; 50 } 51 PetscCallCUDA(WaitForCUDA()); 52 PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 53 A->offloadmask = PETSC_OFFLOAD_BOTH; 54 } 55 PetscFunctionReturn(PETSC_SUCCESS); 56 } 57 58 __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 59 { 60 PetscInt i, row, slice_id, row_in_slice; 61 MatScalar sum; 62 /* one thread per row. */ 63 row = blockIdx.x * blockDim.x + threadIdx.x; 64 if (row < nrows) { 65 slice_id = row / SLICE_HEIGHT; 66 row_in_slice = row % SLICE_HEIGHT; 67 if (slice_id < totalslices) { 68 sum = 0.0; 69 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]]; 70 if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */ 71 if (row_in_slice < (nrows % SLICE_HEIGHT)) y[row] = sum; 72 } else { 73 y[row] = sum; 74 } 75 } 76 } 77 } 78 79 __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 80 { 81 PetscInt i, row, slice_id, row_in_slice; 82 MatScalar sum; 83 /* one thread per row. */ 84 row = blockIdx.x * blockDim.x + threadIdx.x; 85 if (row < nrows) { 86 slice_id = row / SLICE_HEIGHT; 87 row_in_slice = row % SLICE_HEIGHT; 88 if (slice_id < totalslices) { 89 sum = 0.0; 90 for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]]; 91 if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */ 92 if (row_in_slice < (nrows % SLICE_HEIGHT)) z[row] = y[row] + sum; 93 } else { 94 z[row] = y[row] + sum; 95 } 96 } 97 } 98 } 99 100 __global__ void matmult_seqsell_tiled_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 101 { 102 __shared__ MatScalar shared[256]; 103 PetscInt i, row, slice_id, row_in_slice; 104 /* one thread per row. */ 105 row = blockIdx.x * blockDim.x + threadIdx.x; 106 if (row < nrows) { 107 slice_id = row / SLICE_HEIGHT; 108 row_in_slice = row % SLICE_HEIGHT; 109 110 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 111 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 112 if (blockDim.y > 4) { 113 __syncthreads(); 114 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 115 } 116 if (blockDim.y > 2) { 117 __syncthreads(); 118 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 119 } 120 if (blockDim.y > 1) { 121 __syncthreads(); 122 if (threadIdx.y < 1) { shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; } 123 } 124 if (threadIdx.y < 1) { 125 if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */ 126 if (row_in_slice < (nrows % SLICE_HEIGHT)) y[row] = shared[threadIdx.x]; 127 } else { 128 y[row] = shared[threadIdx.x]; 129 } 130 } 131 } 132 } 133 134 __global__ void matmultadd_seqsell_tiled_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 135 { 136 __shared__ MatScalar shared[256]; 137 PetscInt i, row, slice_id, row_in_slice; 138 /* one thread per row. */ 139 row = blockIdx.x * blockDim.x + threadIdx.x; 140 if (row < nrows) { 141 slice_id = row / SLICE_HEIGHT; 142 row_in_slice = row % SLICE_HEIGHT; 143 144 shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 145 for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 146 if (blockDim.y > 4) { 147 __syncthreads(); 148 if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 149 } 150 if (blockDim.y > 2) { 151 __syncthreads(); 152 if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 153 } 154 if (blockDim.y > 1) { 155 __syncthreads(); 156 if (threadIdx.y < 1) { shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; } 157 } 158 if (threadIdx.y < 1) { 159 if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */ 160 if (row_in_slice < (nrows % SLICE_HEIGHT)) z[row] = y[row] + shared[threadIdx.x]; 161 } else { 162 z[row] = y[row] + shared[threadIdx.x]; 163 } 164 } 165 } 166 } 167 168 PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 169 { 170 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 171 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 172 PetscScalar *y; 173 const PetscScalar *x; 174 PetscInt totalslices = a->totalslices, nrows = A->rmap->n; 175 MatScalar *aval; 176 PetscInt *acolidx; 177 PetscInt *sliidx; 178 PetscInt nblocks, blocksize = 256; 179 180 PetscFunctionBegin; 181 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 182 /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 183 aval = cudastruct->val; 184 acolidx = cudastruct->colidx; 185 sliidx = cudastruct->sliidx; 186 187 PetscCall(VecCUDAGetArrayRead(xx, &x)); 188 PetscCall(VecCUDAGetArrayWrite(yy, &y)); 189 PetscCall(PetscLogGpuTimeBegin()); 190 nblocks = (nrows + blocksize - 1) / blocksize; 191 if (nblocks >= 80) { 192 matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 193 } else { 194 PetscInt avg_width; 195 dim3 block1(256, 1), block2(128, 2), block4(64, 4), block8(32, 8); 196 avg_width = a->sliidx[a->totalslices] / (SLICE_HEIGHT * a->totalslices); 197 if (avg_width > 64) { 198 matmult_seqsell_tiled_kernel<<<nblocks * 8, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 199 } else if (avg_width > 32) { 200 matmult_seqsell_tiled_kernel<<<nblocks * 4, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 201 } else if (avg_width > 16) { 202 matmult_seqsell_tiled_kernel<<<nblocks * 2, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 203 } else { 204 matmult_seqsell_tiled_kernel<<<nblocks, block1>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 205 } 206 } 207 PetscCallCUDA(WaitForCUDA()); 208 PetscCall(PetscLogGpuTimeEnd()); 209 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 210 PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 211 PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 212 PetscFunctionReturn(PETSC_SUCCESS); 213 } 214 215 PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 216 { 217 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 218 Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 219 PetscScalar *z; 220 const PetscScalar *y, *x; 221 PetscInt totalslices = a->totalslices, nrows = A->rmap->n; 222 MatScalar *aval = cudastruct->val; 223 PetscInt *acolidx = cudastruct->colidx; 224 PetscInt *sliidx = cudastruct->sliidx; 225 226 PetscFunctionBegin; 227 PetscCall(MatSeqSELLCUDACopyToGPU(A)); 228 if (a->nz) { 229 PetscInt nblocks, blocksize = 256; 230 PetscCall(VecCUDAGetArrayRead(xx, &x)); 231 PetscCall(VecCUDAGetArrayRead(yy, &y)); 232 PetscCall(VecCUDAGetArrayWrite(zz, &z)); 233 PetscCall(PetscLogGpuTimeBegin()); 234 nblocks = (nrows + blocksize - 1) / blocksize; 235 if (nblocks >= 80) { 236 matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 237 } else { 238 PetscInt avg_width; 239 dim3 block1(256, 1), block2(128, 2), block4(64, 4), block8(32, 8); 240 avg_width = a->sliidx[a->totalslices] / (SLICE_HEIGHT * a->totalslices); 241 if (avg_width > 64) { 242 matmultadd_seqsell_tiled_kernel<<<nblocks * 8, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 243 } else if (avg_width > 32) { 244 matmultadd_seqsell_tiled_kernel<<<nblocks * 4, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 245 } else if (avg_width > 16) { 246 matmultadd_seqsell_tiled_kernel<<<nblocks * 2, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 247 } else { 248 matmultadd_seqsell_tiled_kernel<<<nblocks, block1>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 249 } 250 } 251 PetscCallCUDA(WaitForCUDA()); 252 PetscCall(PetscLogGpuTimeEnd()); 253 PetscCall(VecCUDARestoreArrayRead(xx, &x)); 254 PetscCall(VecCUDARestoreArrayRead(yy, &y)); 255 PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 256 PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 257 } else { 258 PetscCall(VecCopy(yy, zz)); 259 } 260 PetscFunctionReturn(PETSC_SUCCESS); 261 } 262 263 static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 264 { 265 PetscFunctionBegin; 266 PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 267 PetscOptionsHeadEnd(); 268 PetscFunctionReturn(PETSC_SUCCESS); 269 } 270 271 static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 272 { 273 PetscFunctionBegin; 274 PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 275 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 276 if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 277 A->ops->mult = MatMult_SeqSELLCUDA; 278 A->ops->multadd = MatMultAdd_SeqSELLCUDA; 279 PetscFunctionReturn(PETSC_SUCCESS); 280 } 281 282 static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 283 { 284 PetscFunctionBegin; 285 if (A->factortype == MAT_FACTOR_NONE) { 286 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 287 } 288 PetscCall(MatDestroy_SeqSELL(A)); 289 PetscFunctionReturn(PETSC_SUCCESS); 290 } 291 292 PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 293 static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 294 { 295 PetscFunctionBegin; 296 PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 297 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 298 PetscFunctionReturn(PETSC_SUCCESS); 299 } 300 301 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 302 { 303 Mat_SeqSELLCUDA *cudastruct; 304 305 PetscFunctionBegin; 306 PetscCall(PetscFree(B->defaultvectype)); 307 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 308 309 if (!B->spptr) { 310 if (B->factortype == MAT_FACTOR_NONE) { 311 PetscCall(PetscNew(&cudastruct)); 312 B->spptr = cudastruct; 313 } 314 } 315 316 B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 317 B->ops->destroy = MatDestroy_SeqSELLCUDA; 318 B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 319 B->ops->mult = MatMult_SeqSELLCUDA; 320 B->ops->multadd = MatMultAdd_SeqSELLCUDA; 321 B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 322 323 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 324 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 325 PetscFunctionReturn(PETSC_SUCCESS); 326 } 327 328 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 329 { 330 PetscFunctionBegin; 331 PetscCall(MatCreate_SeqSELL(B)); 332 PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 333 PetscFunctionReturn(PETSC_SUCCESS); 334 } 335