1*2d1451d4SHong Zhang #include <cuda_runtime.h> 2*2d1451d4SHong Zhang 3*2d1451d4SHong Zhang #include <petscdevice_cuda.h> 4*2d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5*2d1451d4SHong Zhang 6*2d1451d4SHong Zhang typedef struct { 7*2d1451d4SHong Zhang PetscInt *colidx; /* column index */ 8*2d1451d4SHong Zhang MatScalar *val; 9*2d1451d4SHong Zhang PetscInt *sliidx; 10*2d1451d4SHong Zhang PetscInt nonzerostate; 11*2d1451d4SHong Zhang } Mat_SeqSELLCUDA; 12*2d1451d4SHong Zhang 13*2d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct) 14*2d1451d4SHong Zhang { 15*2d1451d4SHong Zhang PetscFunctionBegin; 16*2d1451d4SHong Zhang if (*cudastruct) { 17*2d1451d4SHong Zhang if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); } 18*2d1451d4SHong Zhang if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); } 19*2d1451d4SHong Zhang if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); } 20*2d1451d4SHong Zhang PetscCall(PetscFree(*cudastruct)); 21*2d1451d4SHong Zhang } 22*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 23*2d1451d4SHong Zhang } 24*2d1451d4SHong Zhang 25*2d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A) 26*2d1451d4SHong Zhang { 27*2d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 28*2d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 29*2d1451d4SHong Zhang 30*2d1451d4SHong Zhang PetscFunctionBegin; 31*2d1451d4SHong Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 32*2d1451d4SHong Zhang PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0)); 33*2d1451d4SHong Zhang if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) { 34*2d1451d4SHong Zhang /* copy values only */ 35*2d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 36*2d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar)))); 37*2d1451d4SHong Zhang } else { 38*2d1451d4SHong Zhang if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); } 39*2d1451d4SHong Zhang if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); } 40*2d1451d4SHong Zhang if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); } 41*2d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt))); 42*2d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt))); 43*2d1451d4SHong Zhang PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar))); 44*2d1451d4SHong Zhang /* copy values, nz or maxallocmat? */ 45*2d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice)); 46*2d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice)); 47*2d1451d4SHong Zhang PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice)); 48*2d1451d4SHong Zhang PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt))); 49*2d1451d4SHong Zhang cudastruct->nonzerostate = A->nonzerostate; 50*2d1451d4SHong Zhang } 51*2d1451d4SHong Zhang PetscCallCUDA(WaitForCUDA()); 52*2d1451d4SHong Zhang PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0)); 53*2d1451d4SHong Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 54*2d1451d4SHong Zhang } 55*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 56*2d1451d4SHong Zhang } 57*2d1451d4SHong Zhang 58*2d1451d4SHong Zhang __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 59*2d1451d4SHong Zhang { 60*2d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 61*2d1451d4SHong Zhang MatScalar sum; 62*2d1451d4SHong Zhang /* one thread per row. */ 63*2d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 64*2d1451d4SHong Zhang if (row < nrows) { 65*2d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 66*2d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 67*2d1451d4SHong Zhang if (slice_id < totalslices) { 68*2d1451d4SHong Zhang sum = 0.0; 69*2d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]]; 70*2d1451d4SHong Zhang if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */ 71*2d1451d4SHong Zhang if (row_in_slice < (nrows % SLICE_HEIGHT)) y[row] = sum; 72*2d1451d4SHong Zhang } else { 73*2d1451d4SHong Zhang y[row] = sum; 74*2d1451d4SHong Zhang } 75*2d1451d4SHong Zhang } 76*2d1451d4SHong Zhang } 77*2d1451d4SHong Zhang } 78*2d1451d4SHong Zhang 79*2d1451d4SHong Zhang __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 80*2d1451d4SHong Zhang { 81*2d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 82*2d1451d4SHong Zhang MatScalar sum; 83*2d1451d4SHong Zhang /* one thread per row. */ 84*2d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 85*2d1451d4SHong Zhang if (row < nrows) { 86*2d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 87*2d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 88*2d1451d4SHong Zhang if (slice_id < totalslices) { 89*2d1451d4SHong Zhang sum = 0.0; 90*2d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]]; 91*2d1451d4SHong Zhang if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */ 92*2d1451d4SHong Zhang if (row_in_slice < (nrows % SLICE_HEIGHT)) z[row] = y[row] + sum; 93*2d1451d4SHong Zhang } else { 94*2d1451d4SHong Zhang z[row] = y[row] + sum; 95*2d1451d4SHong Zhang } 96*2d1451d4SHong Zhang } 97*2d1451d4SHong Zhang } 98*2d1451d4SHong Zhang } 99*2d1451d4SHong Zhang 100*2d1451d4SHong Zhang __global__ void matmult_seqsell_tiled_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y) 101*2d1451d4SHong Zhang { 102*2d1451d4SHong Zhang __shared__ MatScalar shared[256]; 103*2d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 104*2d1451d4SHong Zhang /* one thread per row. */ 105*2d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 106*2d1451d4SHong Zhang if (row < nrows) { 107*2d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 108*2d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 109*2d1451d4SHong Zhang 110*2d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 111*2d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 112*2d1451d4SHong Zhang if (blockDim.y > 4) { 113*2d1451d4SHong Zhang __syncthreads(); 114*2d1451d4SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 115*2d1451d4SHong Zhang } 116*2d1451d4SHong Zhang if (blockDim.y > 2) { 117*2d1451d4SHong Zhang __syncthreads(); 118*2d1451d4SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 119*2d1451d4SHong Zhang } 120*2d1451d4SHong Zhang if (blockDim.y > 1) { 121*2d1451d4SHong Zhang __syncthreads(); 122*2d1451d4SHong Zhang if (threadIdx.y < 1) { shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; } 123*2d1451d4SHong Zhang } 124*2d1451d4SHong Zhang if (threadIdx.y < 1) { 125*2d1451d4SHong Zhang if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */ 126*2d1451d4SHong Zhang if (row_in_slice < (nrows % SLICE_HEIGHT)) y[row] = shared[threadIdx.x]; 127*2d1451d4SHong Zhang } else { 128*2d1451d4SHong Zhang y[row] = shared[threadIdx.x]; 129*2d1451d4SHong Zhang } 130*2d1451d4SHong Zhang } 131*2d1451d4SHong Zhang } 132*2d1451d4SHong Zhang } 133*2d1451d4SHong Zhang 134*2d1451d4SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z) 135*2d1451d4SHong Zhang { 136*2d1451d4SHong Zhang __shared__ MatScalar shared[256]; 137*2d1451d4SHong Zhang PetscInt i, row, slice_id, row_in_slice; 138*2d1451d4SHong Zhang /* one thread per row. */ 139*2d1451d4SHong Zhang row = blockIdx.x * blockDim.x + threadIdx.x; 140*2d1451d4SHong Zhang if (row < nrows) { 141*2d1451d4SHong Zhang slice_id = row / SLICE_HEIGHT; 142*2d1451d4SHong Zhang row_in_slice = row % SLICE_HEIGHT; 143*2d1451d4SHong Zhang 144*2d1451d4SHong Zhang shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0; 145*2d1451d4SHong Zhang for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]]; 146*2d1451d4SHong Zhang if (blockDim.y > 4) { 147*2d1451d4SHong Zhang __syncthreads(); 148*2d1451d4SHong Zhang if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; } 149*2d1451d4SHong Zhang } 150*2d1451d4SHong Zhang if (blockDim.y > 2) { 151*2d1451d4SHong Zhang __syncthreads(); 152*2d1451d4SHong Zhang if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; } 153*2d1451d4SHong Zhang } 154*2d1451d4SHong Zhang if (blockDim.y > 1) { 155*2d1451d4SHong Zhang __syncthreads(); 156*2d1451d4SHong Zhang if (threadIdx.y < 1) { shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; } 157*2d1451d4SHong Zhang } 158*2d1451d4SHong Zhang if (threadIdx.y < 1) { 159*2d1451d4SHong Zhang if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */ 160*2d1451d4SHong Zhang if (row_in_slice < (nrows % SLICE_HEIGHT)) z[row] = y[row] + shared[threadIdx.x]; 161*2d1451d4SHong Zhang } else { 162*2d1451d4SHong Zhang z[row] = y[row] + shared[threadIdx.x]; 163*2d1451d4SHong Zhang } 164*2d1451d4SHong Zhang } 165*2d1451d4SHong Zhang } 166*2d1451d4SHong Zhang } 167*2d1451d4SHong Zhang 168*2d1451d4SHong Zhang PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy) 169*2d1451d4SHong Zhang { 170*2d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 171*2d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 172*2d1451d4SHong Zhang PetscScalar *y; 173*2d1451d4SHong Zhang const PetscScalar *x; 174*2d1451d4SHong Zhang PetscInt totalslices = a->totalslices, nrows = A->rmap->n; 175*2d1451d4SHong Zhang MatScalar *aval; 176*2d1451d4SHong Zhang PetscInt *acolidx; 177*2d1451d4SHong Zhang PetscInt *sliidx; 178*2d1451d4SHong Zhang PetscInt nblocks, blocksize = 256; 179*2d1451d4SHong Zhang 180*2d1451d4SHong Zhang PetscFunctionBegin; 181*2d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 182*2d1451d4SHong Zhang /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */ 183*2d1451d4SHong Zhang aval = cudastruct->val; 184*2d1451d4SHong Zhang acolidx = cudastruct->colidx; 185*2d1451d4SHong Zhang sliidx = cudastruct->sliidx; 186*2d1451d4SHong Zhang 187*2d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 188*2d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(yy, &y)); 189*2d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 190*2d1451d4SHong Zhang nblocks = (nrows + blocksize - 1) / blocksize; 191*2d1451d4SHong Zhang if (nblocks >= 80) { 192*2d1451d4SHong Zhang matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 193*2d1451d4SHong Zhang } else { 194*2d1451d4SHong Zhang PetscInt avg_width; 195*2d1451d4SHong Zhang dim3 block1(256, 1), block2(128, 2), block4(64, 4), block8(32, 8); 196*2d1451d4SHong Zhang avg_width = a->sliidx[a->totalslices] / (SLICE_HEIGHT * a->totalslices); 197*2d1451d4SHong Zhang if (avg_width > 64) { 198*2d1451d4SHong Zhang matmult_seqsell_tiled_kernel<<<nblocks * 8, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 199*2d1451d4SHong Zhang } else if (avg_width > 32) { 200*2d1451d4SHong Zhang matmult_seqsell_tiled_kernel<<<nblocks * 4, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 201*2d1451d4SHong Zhang } else if (avg_width > 16) { 202*2d1451d4SHong Zhang matmult_seqsell_tiled_kernel<<<nblocks * 2, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 203*2d1451d4SHong Zhang } else { 204*2d1451d4SHong Zhang matmult_seqsell_tiled_kernel<<<nblocks, block1>>>(nrows, totalslices, acolidx, aval, sliidx, x, y); 205*2d1451d4SHong Zhang } 206*2d1451d4SHong Zhang } 207*2d1451d4SHong Zhang PetscCallCUDA(WaitForCUDA()); 208*2d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 209*2d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 210*2d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(yy, &y)); 211*2d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 212*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 213*2d1451d4SHong Zhang } 214*2d1451d4SHong Zhang 215*2d1451d4SHong Zhang PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 216*2d1451d4SHong Zhang { 217*2d1451d4SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 218*2d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr; 219*2d1451d4SHong Zhang PetscScalar *z; 220*2d1451d4SHong Zhang const PetscScalar *y, *x; 221*2d1451d4SHong Zhang PetscInt totalslices = a->totalslices, nrows = A->rmap->n; 222*2d1451d4SHong Zhang MatScalar *aval = cudastruct->val; 223*2d1451d4SHong Zhang PetscInt *acolidx = cudastruct->colidx; 224*2d1451d4SHong Zhang PetscInt *sliidx = cudastruct->sliidx; 225*2d1451d4SHong Zhang 226*2d1451d4SHong Zhang PetscFunctionBegin; 227*2d1451d4SHong Zhang PetscCall(MatSeqSELLCUDACopyToGPU(A)); 228*2d1451d4SHong Zhang if (a->nz) { 229*2d1451d4SHong Zhang PetscInt nblocks, blocksize = 256; 230*2d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(xx, &x)); 231*2d1451d4SHong Zhang PetscCall(VecCUDAGetArrayRead(yy, &y)); 232*2d1451d4SHong Zhang PetscCall(VecCUDAGetArrayWrite(zz, &z)); 233*2d1451d4SHong Zhang PetscCall(PetscLogGpuTimeBegin()); 234*2d1451d4SHong Zhang nblocks = (nrows + blocksize - 1) / blocksize; 235*2d1451d4SHong Zhang if (nblocks >= 80) { 236*2d1451d4SHong Zhang matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 237*2d1451d4SHong Zhang } else { 238*2d1451d4SHong Zhang PetscInt avg_width; 239*2d1451d4SHong Zhang dim3 block1(256, 1), block2(128, 2), block4(64, 4), block8(32, 8); 240*2d1451d4SHong Zhang avg_width = a->sliidx[a->totalslices] / (SLICE_HEIGHT * a->totalslices); 241*2d1451d4SHong Zhang if (avg_width > 64) { 242*2d1451d4SHong Zhang matmultadd_seqsell_tiled_kernel<<<nblocks * 8, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 243*2d1451d4SHong Zhang } else if (avg_width > 32) { 244*2d1451d4SHong Zhang matmultadd_seqsell_tiled_kernel<<<nblocks * 4, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 245*2d1451d4SHong Zhang } else if (avg_width > 16) { 246*2d1451d4SHong Zhang matmultadd_seqsell_tiled_kernel<<<nblocks * 2, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 247*2d1451d4SHong Zhang } else { 248*2d1451d4SHong Zhang matmultadd_seqsell_tiled_kernel<<<nblocks, block1>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z); 249*2d1451d4SHong Zhang } 250*2d1451d4SHong Zhang } 251*2d1451d4SHong Zhang PetscCallCUDA(WaitForCUDA()); 252*2d1451d4SHong Zhang PetscCall(PetscLogGpuTimeEnd()); 253*2d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(xx, &x)); 254*2d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayRead(yy, &y)); 255*2d1451d4SHong Zhang PetscCall(VecCUDARestoreArrayWrite(zz, &z)); 256*2d1451d4SHong Zhang PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 257*2d1451d4SHong Zhang } else { 258*2d1451d4SHong Zhang PetscCall(VecCopy(yy, zz)); 259*2d1451d4SHong Zhang } 260*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 261*2d1451d4SHong Zhang } 262*2d1451d4SHong Zhang 263*2d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject) 264*2d1451d4SHong Zhang { 265*2d1451d4SHong Zhang PetscFunctionBegin; 266*2d1451d4SHong Zhang PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options"); 267*2d1451d4SHong Zhang PetscOptionsHeadEnd(); 268*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 269*2d1451d4SHong Zhang } 270*2d1451d4SHong Zhang 271*2d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode) 272*2d1451d4SHong Zhang { 273*2d1451d4SHong Zhang PetscFunctionBegin; 274*2d1451d4SHong Zhang PetscCall(MatAssemblyEnd_SeqSELL(A, mode)); 275*2d1451d4SHong Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 276*2d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); } 277*2d1451d4SHong Zhang A->ops->mult = MatMult_SeqSELLCUDA; 278*2d1451d4SHong Zhang A->ops->multadd = MatMultAdd_SeqSELLCUDA; 279*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 280*2d1451d4SHong Zhang } 281*2d1451d4SHong Zhang 282*2d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A) 283*2d1451d4SHong Zhang { 284*2d1451d4SHong Zhang PetscFunctionBegin; 285*2d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { 286*2d1451d4SHong Zhang if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); } 287*2d1451d4SHong Zhang } 288*2d1451d4SHong Zhang PetscCall(MatDestroy_SeqSELL(A)); 289*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 290*2d1451d4SHong Zhang } 291*2d1451d4SHong Zhang 292*2d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 293*2d1451d4SHong Zhang static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B) 294*2d1451d4SHong Zhang { 295*2d1451d4SHong Zhang PetscFunctionBegin; 296*2d1451d4SHong Zhang PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B)); 297*2d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B)); 298*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 299*2d1451d4SHong Zhang } 300*2d1451d4SHong Zhang 301*2d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B) 302*2d1451d4SHong Zhang { 303*2d1451d4SHong Zhang Mat_SeqSELLCUDA *cudastruct; 304*2d1451d4SHong Zhang 305*2d1451d4SHong Zhang PetscFunctionBegin; 306*2d1451d4SHong Zhang PetscCall(PetscFree(B->defaultvectype)); 307*2d1451d4SHong Zhang PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 308*2d1451d4SHong Zhang 309*2d1451d4SHong Zhang if (!B->spptr) { 310*2d1451d4SHong Zhang if (B->factortype == MAT_FACTOR_NONE) { 311*2d1451d4SHong Zhang PetscCall(PetscNew(&cudastruct)); 312*2d1451d4SHong Zhang B->spptr = cudastruct; 313*2d1451d4SHong Zhang } 314*2d1451d4SHong Zhang } 315*2d1451d4SHong Zhang 316*2d1451d4SHong Zhang B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA; 317*2d1451d4SHong Zhang B->ops->destroy = MatDestroy_SeqSELLCUDA; 318*2d1451d4SHong Zhang B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA; 319*2d1451d4SHong Zhang B->ops->mult = MatMult_SeqSELLCUDA; 320*2d1451d4SHong Zhang B->ops->multadd = MatMultAdd_SeqSELLCUDA; 321*2d1451d4SHong Zhang B->ops->duplicate = MatDuplicate_SeqSELLCUDA; 322*2d1451d4SHong Zhang 323*2d1451d4SHong Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA)); 324*2d1451d4SHong Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 325*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 326*2d1451d4SHong Zhang } 327*2d1451d4SHong Zhang 328*2d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B) 329*2d1451d4SHong Zhang { 330*2d1451d4SHong Zhang PetscFunctionBegin; 331*2d1451d4SHong Zhang PetscCall(MatCreate_SeqSELL(B)); 332*2d1451d4SHong Zhang PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B)); 333*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 334*2d1451d4SHong Zhang } 335