xref: /petsc/src/mat/impls/sell/seq/seqcuda/sellcuda.cu (revision 2d1451d43b73a0495cd81c074cbc1e0206888947)
1*2d1451d4SHong Zhang #include <cuda_runtime.h>
2*2d1451d4SHong Zhang 
3*2d1451d4SHong Zhang #include <petscdevice_cuda.h>
4*2d1451d4SHong Zhang #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5*2d1451d4SHong Zhang 
6*2d1451d4SHong Zhang typedef struct {
7*2d1451d4SHong Zhang   PetscInt  *colidx; /* column index */
8*2d1451d4SHong Zhang   MatScalar *val;
9*2d1451d4SHong Zhang   PetscInt  *sliidx;
10*2d1451d4SHong Zhang   PetscInt   nonzerostate;
11*2d1451d4SHong Zhang } Mat_SeqSELLCUDA;
12*2d1451d4SHong Zhang 
13*2d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
14*2d1451d4SHong Zhang {
15*2d1451d4SHong Zhang   PetscFunctionBegin;
16*2d1451d4SHong Zhang   if (*cudastruct) {
17*2d1451d4SHong Zhang     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
18*2d1451d4SHong Zhang     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
19*2d1451d4SHong Zhang     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
20*2d1451d4SHong Zhang     PetscCall(PetscFree(*cudastruct));
21*2d1451d4SHong Zhang   }
22*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
23*2d1451d4SHong Zhang }
24*2d1451d4SHong Zhang 
25*2d1451d4SHong Zhang static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
26*2d1451d4SHong Zhang {
27*2d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
28*2d1451d4SHong Zhang   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;
29*2d1451d4SHong Zhang 
30*2d1451d4SHong Zhang   PetscFunctionBegin;
31*2d1451d4SHong Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
32*2d1451d4SHong Zhang     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
33*2d1451d4SHong Zhang     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
34*2d1451d4SHong Zhang       /* copy values only */
35*2d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
36*2d1451d4SHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
37*2d1451d4SHong Zhang     } else {
38*2d1451d4SHong Zhang       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
39*2d1451d4SHong Zhang       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
40*2d1451d4SHong Zhang       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
41*2d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->sliidx), (a->totalslices + 1) * sizeof(PetscInt)));
42*2d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->colidx), a->maxallocmat * sizeof(PetscInt)));
43*2d1451d4SHong Zhang       PetscCallCUDA(cudaMalloc((void **)&(cudastruct->val), a->maxallocmat * sizeof(MatScalar)));
44*2d1451d4SHong Zhang       /* copy values, nz or maxallocmat? */
45*2d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice));
46*2d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(PetscInt), cudaMemcpyHostToDevice));
47*2d1451d4SHong Zhang       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
48*2d1451d4SHong Zhang       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1) * sizeof(PetscInt)));
49*2d1451d4SHong Zhang       cudastruct->nonzerostate = A->nonzerostate;
50*2d1451d4SHong Zhang     }
51*2d1451d4SHong Zhang     PetscCallCUDA(WaitForCUDA());
52*2d1451d4SHong Zhang     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
53*2d1451d4SHong Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
54*2d1451d4SHong Zhang   }
55*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
56*2d1451d4SHong Zhang }
57*2d1451d4SHong Zhang 
58*2d1451d4SHong Zhang __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
59*2d1451d4SHong Zhang {
60*2d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
61*2d1451d4SHong Zhang   MatScalar sum;
62*2d1451d4SHong Zhang   /* one thread per row. */
63*2d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
64*2d1451d4SHong Zhang   if (row < nrows) {
65*2d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
66*2d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
67*2d1451d4SHong Zhang     if (slice_id < totalslices) {
68*2d1451d4SHong Zhang       sum = 0.0;
69*2d1451d4SHong Zhang       for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
70*2d1451d4SHong Zhang       if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */
71*2d1451d4SHong Zhang         if (row_in_slice < (nrows % SLICE_HEIGHT)) y[row] = sum;
72*2d1451d4SHong Zhang       } else {
73*2d1451d4SHong Zhang         y[row] = sum;
74*2d1451d4SHong Zhang       }
75*2d1451d4SHong Zhang     }
76*2d1451d4SHong Zhang   }
77*2d1451d4SHong Zhang }
78*2d1451d4SHong Zhang 
79*2d1451d4SHong Zhang __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
80*2d1451d4SHong Zhang {
81*2d1451d4SHong Zhang   PetscInt  i, row, slice_id, row_in_slice;
82*2d1451d4SHong Zhang   MatScalar sum;
83*2d1451d4SHong Zhang   /* one thread per row. */
84*2d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
85*2d1451d4SHong Zhang   if (row < nrows) {
86*2d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
87*2d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
88*2d1451d4SHong Zhang     if (slice_id < totalslices) {
89*2d1451d4SHong Zhang       sum = 0.0;
90*2d1451d4SHong Zhang       for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT) sum += aval[i] * x[acolidx[i]];
91*2d1451d4SHong Zhang       if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */
92*2d1451d4SHong Zhang         if (row_in_slice < (nrows % SLICE_HEIGHT)) z[row] = y[row] + sum;
93*2d1451d4SHong Zhang       } else {
94*2d1451d4SHong Zhang         z[row] = y[row] + sum;
95*2d1451d4SHong Zhang       }
96*2d1451d4SHong Zhang     }
97*2d1451d4SHong Zhang   }
98*2d1451d4SHong Zhang }
99*2d1451d4SHong Zhang 
100*2d1451d4SHong Zhang __global__ void matmult_seqsell_tiled_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
101*2d1451d4SHong Zhang {
102*2d1451d4SHong Zhang   __shared__ MatScalar shared[256];
103*2d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
104*2d1451d4SHong Zhang   /* one thread per row. */
105*2d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
106*2d1451d4SHong Zhang   if (row < nrows) {
107*2d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
108*2d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
109*2d1451d4SHong Zhang 
110*2d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
111*2d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
112*2d1451d4SHong Zhang     if (blockDim.y > 4) {
113*2d1451d4SHong Zhang       __syncthreads();
114*2d1451d4SHong Zhang       if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
115*2d1451d4SHong Zhang     }
116*2d1451d4SHong Zhang     if (blockDim.y > 2) {
117*2d1451d4SHong Zhang       __syncthreads();
118*2d1451d4SHong Zhang       if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
119*2d1451d4SHong Zhang     }
120*2d1451d4SHong Zhang     if (blockDim.y > 1) {
121*2d1451d4SHong Zhang       __syncthreads();
122*2d1451d4SHong Zhang       if (threadIdx.y < 1) { shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; }
123*2d1451d4SHong Zhang     }
124*2d1451d4SHong Zhang     if (threadIdx.y < 1) {
125*2d1451d4SHong Zhang       if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */
126*2d1451d4SHong Zhang         if (row_in_slice < (nrows % SLICE_HEIGHT)) y[row] = shared[threadIdx.x];
127*2d1451d4SHong Zhang       } else {
128*2d1451d4SHong Zhang         y[row] = shared[threadIdx.x];
129*2d1451d4SHong Zhang       }
130*2d1451d4SHong Zhang     }
131*2d1451d4SHong Zhang   }
132*2d1451d4SHong Zhang }
133*2d1451d4SHong Zhang 
134*2d1451d4SHong Zhang __global__ void matmultadd_seqsell_tiled_kernel(PetscInt nrows, PetscInt totalslices, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
135*2d1451d4SHong Zhang {
136*2d1451d4SHong Zhang   __shared__ MatScalar shared[256];
137*2d1451d4SHong Zhang   PetscInt             i, row, slice_id, row_in_slice;
138*2d1451d4SHong Zhang   /* one thread per row. */
139*2d1451d4SHong Zhang   row = blockIdx.x * blockDim.x + threadIdx.x;
140*2d1451d4SHong Zhang   if (row < nrows) {
141*2d1451d4SHong Zhang     slice_id     = row / SLICE_HEIGHT;
142*2d1451d4SHong Zhang     row_in_slice = row % SLICE_HEIGHT;
143*2d1451d4SHong Zhang 
144*2d1451d4SHong Zhang     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
145*2d1451d4SHong Zhang     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
146*2d1451d4SHong Zhang     if (blockDim.y > 4) {
147*2d1451d4SHong Zhang       __syncthreads();
148*2d1451d4SHong Zhang       if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
149*2d1451d4SHong Zhang     }
150*2d1451d4SHong Zhang     if (blockDim.y > 2) {
151*2d1451d4SHong Zhang       __syncthreads();
152*2d1451d4SHong Zhang       if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
153*2d1451d4SHong Zhang     }
154*2d1451d4SHong Zhang     if (blockDim.y > 1) {
155*2d1451d4SHong Zhang       __syncthreads();
156*2d1451d4SHong Zhang       if (threadIdx.y < 1) { shared[threadIdx.x] += shared[blockDim.x + threadIdx.x]; }
157*2d1451d4SHong Zhang     }
158*2d1451d4SHong Zhang     if (threadIdx.y < 1) {
159*2d1451d4SHong Zhang       if (slice_id == totalslices - 1 && nrows % SLICE_HEIGHT) { /* if last slice has padding rows */
160*2d1451d4SHong Zhang         if (row_in_slice < (nrows % SLICE_HEIGHT)) z[row] = y[row] + shared[threadIdx.x];
161*2d1451d4SHong Zhang       } else {
162*2d1451d4SHong Zhang         z[row] = y[row] + shared[threadIdx.x];
163*2d1451d4SHong Zhang       }
164*2d1451d4SHong Zhang     }
165*2d1451d4SHong Zhang   }
166*2d1451d4SHong Zhang }
167*2d1451d4SHong Zhang 
168*2d1451d4SHong Zhang PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
169*2d1451d4SHong Zhang {
170*2d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
171*2d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
172*2d1451d4SHong Zhang   PetscScalar       *y;
173*2d1451d4SHong Zhang   const PetscScalar *x;
174*2d1451d4SHong Zhang   PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
175*2d1451d4SHong Zhang   MatScalar         *aval;
176*2d1451d4SHong Zhang   PetscInt          *acolidx;
177*2d1451d4SHong Zhang   PetscInt          *sliidx;
178*2d1451d4SHong Zhang   PetscInt           nblocks, blocksize = 256;
179*2d1451d4SHong Zhang 
180*2d1451d4SHong Zhang   PetscFunctionBegin;
181*2d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
182*2d1451d4SHong Zhang   /* cudastruct may not be available until MatSeqSELLCUDACopyToGPU() is called */
183*2d1451d4SHong Zhang   aval    = cudastruct->val;
184*2d1451d4SHong Zhang   acolidx = cudastruct->colidx;
185*2d1451d4SHong Zhang   sliidx  = cudastruct->sliidx;
186*2d1451d4SHong Zhang 
187*2d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayRead(xx, &x));
188*2d1451d4SHong Zhang   PetscCall(VecCUDAGetArrayWrite(yy, &y));
189*2d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeBegin());
190*2d1451d4SHong Zhang   nblocks = (nrows + blocksize - 1) / blocksize;
191*2d1451d4SHong Zhang   if (nblocks >= 80) {
192*2d1451d4SHong Zhang     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
193*2d1451d4SHong Zhang   } else {
194*2d1451d4SHong Zhang     PetscInt avg_width;
195*2d1451d4SHong Zhang     dim3     block1(256, 1), block2(128, 2), block4(64, 4), block8(32, 8);
196*2d1451d4SHong Zhang     avg_width = a->sliidx[a->totalslices] / (SLICE_HEIGHT * a->totalslices);
197*2d1451d4SHong Zhang     if (avg_width > 64) {
198*2d1451d4SHong Zhang       matmult_seqsell_tiled_kernel<<<nblocks * 8, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
199*2d1451d4SHong Zhang     } else if (avg_width > 32) {
200*2d1451d4SHong Zhang       matmult_seqsell_tiled_kernel<<<nblocks * 4, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
201*2d1451d4SHong Zhang     } else if (avg_width > 16) {
202*2d1451d4SHong Zhang       matmult_seqsell_tiled_kernel<<<nblocks * 2, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
203*2d1451d4SHong Zhang     } else {
204*2d1451d4SHong Zhang       matmult_seqsell_tiled_kernel<<<nblocks, block1>>>(nrows, totalslices, acolidx, aval, sliidx, x, y);
205*2d1451d4SHong Zhang     }
206*2d1451d4SHong Zhang   }
207*2d1451d4SHong Zhang   PetscCallCUDA(WaitForCUDA());
208*2d1451d4SHong Zhang   PetscCall(PetscLogGpuTimeEnd());
209*2d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayRead(xx, &x));
210*2d1451d4SHong Zhang   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
211*2d1451d4SHong Zhang   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
212*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
213*2d1451d4SHong Zhang }
214*2d1451d4SHong Zhang 
215*2d1451d4SHong Zhang PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
216*2d1451d4SHong Zhang {
217*2d1451d4SHong Zhang   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
218*2d1451d4SHong Zhang   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
219*2d1451d4SHong Zhang   PetscScalar       *z;
220*2d1451d4SHong Zhang   const PetscScalar *y, *x;
221*2d1451d4SHong Zhang   PetscInt           totalslices = a->totalslices, nrows = A->rmap->n;
222*2d1451d4SHong Zhang   MatScalar         *aval    = cudastruct->val;
223*2d1451d4SHong Zhang   PetscInt          *acolidx = cudastruct->colidx;
224*2d1451d4SHong Zhang   PetscInt          *sliidx  = cudastruct->sliidx;
225*2d1451d4SHong Zhang 
226*2d1451d4SHong Zhang   PetscFunctionBegin;
227*2d1451d4SHong Zhang   PetscCall(MatSeqSELLCUDACopyToGPU(A));
228*2d1451d4SHong Zhang   if (a->nz) {
229*2d1451d4SHong Zhang     PetscInt nblocks, blocksize = 256;
230*2d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(xx, &x));
231*2d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayRead(yy, &y));
232*2d1451d4SHong Zhang     PetscCall(VecCUDAGetArrayWrite(zz, &z));
233*2d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeBegin());
234*2d1451d4SHong Zhang     nblocks = (nrows + blocksize - 1) / blocksize;
235*2d1451d4SHong Zhang     if (nblocks >= 80) {
236*2d1451d4SHong Zhang       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
237*2d1451d4SHong Zhang     } else {
238*2d1451d4SHong Zhang       PetscInt avg_width;
239*2d1451d4SHong Zhang       dim3     block1(256, 1), block2(128, 2), block4(64, 4), block8(32, 8);
240*2d1451d4SHong Zhang       avg_width = a->sliidx[a->totalslices] / (SLICE_HEIGHT * a->totalslices);
241*2d1451d4SHong Zhang       if (avg_width > 64) {
242*2d1451d4SHong Zhang         matmultadd_seqsell_tiled_kernel<<<nblocks * 8, block8>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
243*2d1451d4SHong Zhang       } else if (avg_width > 32) {
244*2d1451d4SHong Zhang         matmultadd_seqsell_tiled_kernel<<<nblocks * 4, block4>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
245*2d1451d4SHong Zhang       } else if (avg_width > 16) {
246*2d1451d4SHong Zhang         matmultadd_seqsell_tiled_kernel<<<nblocks * 2, block2>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
247*2d1451d4SHong Zhang       } else {
248*2d1451d4SHong Zhang         matmultadd_seqsell_tiled_kernel<<<nblocks, block1>>>(nrows, totalslices, acolidx, aval, sliidx, x, y, z);
249*2d1451d4SHong Zhang       }
250*2d1451d4SHong Zhang     }
251*2d1451d4SHong Zhang     PetscCallCUDA(WaitForCUDA());
252*2d1451d4SHong Zhang     PetscCall(PetscLogGpuTimeEnd());
253*2d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(xx, &x));
254*2d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayRead(yy, &y));
255*2d1451d4SHong Zhang     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
256*2d1451d4SHong Zhang     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
257*2d1451d4SHong Zhang   } else {
258*2d1451d4SHong Zhang     PetscCall(VecCopy(yy, zz));
259*2d1451d4SHong Zhang   }
260*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
261*2d1451d4SHong Zhang }
262*2d1451d4SHong Zhang 
263*2d1451d4SHong Zhang static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
264*2d1451d4SHong Zhang {
265*2d1451d4SHong Zhang   PetscFunctionBegin;
266*2d1451d4SHong Zhang   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
267*2d1451d4SHong Zhang   PetscOptionsHeadEnd();
268*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
269*2d1451d4SHong Zhang }
270*2d1451d4SHong Zhang 
271*2d1451d4SHong Zhang static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
272*2d1451d4SHong Zhang {
273*2d1451d4SHong Zhang   PetscFunctionBegin;
274*2d1451d4SHong Zhang   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
275*2d1451d4SHong Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
276*2d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
277*2d1451d4SHong Zhang   A->ops->mult    = MatMult_SeqSELLCUDA;
278*2d1451d4SHong Zhang   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
279*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
280*2d1451d4SHong Zhang }
281*2d1451d4SHong Zhang 
282*2d1451d4SHong Zhang static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
283*2d1451d4SHong Zhang {
284*2d1451d4SHong Zhang   PetscFunctionBegin;
285*2d1451d4SHong Zhang   if (A->factortype == MAT_FACTOR_NONE) {
286*2d1451d4SHong Zhang     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) { PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr)); }
287*2d1451d4SHong Zhang   }
288*2d1451d4SHong Zhang   PetscCall(MatDestroy_SeqSELL(A));
289*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
290*2d1451d4SHong Zhang }
291*2d1451d4SHong Zhang 
292*2d1451d4SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
293*2d1451d4SHong Zhang static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
294*2d1451d4SHong Zhang {
295*2d1451d4SHong Zhang   PetscFunctionBegin;
296*2d1451d4SHong Zhang   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
297*2d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
298*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
299*2d1451d4SHong Zhang }
300*2d1451d4SHong Zhang 
301*2d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
302*2d1451d4SHong Zhang {
303*2d1451d4SHong Zhang   Mat_SeqSELLCUDA *cudastruct;
304*2d1451d4SHong Zhang 
305*2d1451d4SHong Zhang   PetscFunctionBegin;
306*2d1451d4SHong Zhang   PetscCall(PetscFree(B->defaultvectype));
307*2d1451d4SHong Zhang   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
308*2d1451d4SHong Zhang 
309*2d1451d4SHong Zhang   if (!B->spptr) {
310*2d1451d4SHong Zhang     if (B->factortype == MAT_FACTOR_NONE) {
311*2d1451d4SHong Zhang       PetscCall(PetscNew(&cudastruct));
312*2d1451d4SHong Zhang       B->spptr = cudastruct;
313*2d1451d4SHong Zhang     }
314*2d1451d4SHong Zhang   }
315*2d1451d4SHong Zhang 
316*2d1451d4SHong Zhang   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
317*2d1451d4SHong Zhang   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
318*2d1451d4SHong Zhang   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
319*2d1451d4SHong Zhang   B->ops->mult           = MatMult_SeqSELLCUDA;
320*2d1451d4SHong Zhang   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
321*2d1451d4SHong Zhang   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
322*2d1451d4SHong Zhang 
323*2d1451d4SHong Zhang   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
324*2d1451d4SHong Zhang   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
325*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
326*2d1451d4SHong Zhang }
327*2d1451d4SHong Zhang 
328*2d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
329*2d1451d4SHong Zhang {
330*2d1451d4SHong Zhang   PetscFunctionBegin;
331*2d1451d4SHong Zhang   PetscCall(MatCreate_SeqSELL(B));
332*2d1451d4SHong Zhang   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
333*2d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
334*2d1451d4SHong Zhang }
335