xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision 40badf4fbc550ac1f60bd080eaff6de6d55b946d)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
5 {
6   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
7   IS                iscol=a->col,isrow=a->row;
8   const PetscInt    *r,*c,*rout,*cout;
9   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
10   PetscInt          i,nz;
11   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
12   const MatScalar   *aa=a->a,*v;
13   PetscScalar       *x,*s,*t,*ls;
14   const PetscScalar *b;
15 
16   PetscFunctionBegin;
17   CHKERRQ(VecGetArrayRead(bb,&b));
18   CHKERRQ(VecGetArray(xx,&x));
19   t    = a->solve_work;
20 
21   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
22   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
23 
24   /* forward solve the lower triangular */
25   CHKERRQ(PetscArraycpy(t,b+bs*(*r++),bs));
26   for (i=1; i<n; i++) {
27     v    = aa + bs2*ai[i];
28     vi   = aj + ai[i];
29     nz   = a->diag[i] - ai[i];
30     s    = t + bs*i;
31     CHKERRQ(PetscArraycpy(s,b+bs*(*r++),bs));
32     while (nz--) {
33       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
34       v += bs2;
35     }
36   }
37   /* backward solve the upper triangular */
38   ls = a->solve_work + A->cmap->n;
39   for (i=n-1; i>=0; i--) {
40     v    = aa + bs2*(a->diag[i] + 1);
41     vi   = aj + a->diag[i] + 1;
42     nz   = ai[i+1] - a->diag[i] - 1;
43     CHKERRQ(PetscArraycpy(ls,t+i*bs,bs));
44     while (nz--) {
45       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
46       v += bs2;
47     }
48     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
49     CHKERRQ(PetscArraycpy(x + bs*(*c--),t+i*bs,bs));
50   }
51 
52   CHKERRQ(ISRestoreIndices(isrow,&rout));
53   CHKERRQ(ISRestoreIndices(iscol,&cout));
54   CHKERRQ(VecRestoreArrayRead(bb,&b));
55   CHKERRQ(VecRestoreArray(xx,&x));
56   CHKERRQ(PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n));
57   PetscFunctionReturn(0);
58 }
59 
60 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
63   IS                iscol=a->col,isrow=a->row;
64   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
65   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
66   PetscInt          i,nz,idx,idt,idc;
67   const MatScalar   *aa=a->a,*v;
68   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
69   const PetscScalar *b;
70 
71   PetscFunctionBegin;
72   CHKERRQ(VecGetArrayRead(bb,&b));
73   CHKERRQ(VecGetArray(xx,&x));
74   t    = a->solve_work;
75 
76   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
77   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
78 
79   /* forward solve the lower triangular */
80   idx  = 7*(*r++);
81   t[0] = b[idx];   t[1] = b[1+idx];
82   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
83   t[5] = b[5+idx]; t[6] = b[6+idx];
84 
85   for (i=1; i<n; i++) {
86     v   = aa + 49*ai[i];
87     vi  = aj + ai[i];
88     nz  = diag[i] - ai[i];
89     idx = 7*(*r++);
90     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
91     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
92     while (nz--) {
93       idx = 7*(*vi++);
94       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
95       x4  = t[3+idx];x5 = t[4+idx];
96       x6  = t[5+idx];x7 = t[6+idx];
97       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
98       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
99       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
100       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
101       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
102       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
103       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
104       v  += 49;
105     }
106     idx      = 7*i;
107     t[idx]   = s1;t[1+idx] = s2;
108     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
109     t[5+idx] = s6;t[6+idx] = s7;
110   }
111   /* backward solve the upper triangular */
112   for (i=n-1; i>=0; i--) {
113     v   = aa + 49*diag[i] + 49;
114     vi  = aj + diag[i] + 1;
115     nz  = ai[i+1] - diag[i] - 1;
116     idt = 7*i;
117     s1  = t[idt];  s2 = t[1+idt];
118     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
119     s6  = t[5+idt];s7 = t[6+idt];
120     while (nz--) {
121       idx = 7*(*vi++);
122       x1  = t[idx];   x2 = t[1+idx];
123       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
124       x6  = t[5+idx]; x7 = t[6+idx];
125       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
126       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
127       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
128       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
129       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
130       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
131       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
132       v  += 49;
133     }
134     idc    = 7*(*c--);
135     v      = aa + 49*diag[i];
136     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
137                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
138     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
139                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
140     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
141                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
142     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
143                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
144     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
145                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
146     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
147                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
148     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
149                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
150   }
151 
152   CHKERRQ(ISRestoreIndices(isrow,&rout));
153   CHKERRQ(ISRestoreIndices(iscol,&cout));
154   CHKERRQ(VecRestoreArrayRead(bb,&b));
155   CHKERRQ(VecRestoreArray(xx,&x));
156   CHKERRQ(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n));
157   PetscFunctionReturn(0);
158 }
159 
160 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
161 {
162   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
163   IS                iscol=a->col,isrow=a->row;
164   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
165   const PetscInt    n=a->mbs,*rout,*cout,*vi;
166   PetscInt          i,nz,idx,idt,idc,m;
167   const MatScalar   *aa=a->a,*v;
168   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
169   const PetscScalar *b;
170 
171   PetscFunctionBegin;
172   CHKERRQ(VecGetArrayRead(bb,&b));
173   CHKERRQ(VecGetArray(xx,&x));
174   t    = a->solve_work;
175 
176   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
177   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout;
178 
179   /* forward solve the lower triangular */
180   idx  = 7*r[0];
181   t[0] = b[idx];   t[1] = b[1+idx];
182   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
183   t[5] = b[5+idx]; t[6] = b[6+idx];
184 
185   for (i=1; i<n; i++) {
186     v   = aa + 49*ai[i];
187     vi  = aj + ai[i];
188     nz  = ai[i+1] - ai[i];
189     idx = 7*r[i];
190     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
191     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
192     for (m=0; m<nz; m++) {
193       idx = 7*vi[m];
194       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
195       x4  = t[3+idx];x5 = t[4+idx];
196       x6  = t[5+idx];x7 = t[6+idx];
197       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
198       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
199       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
200       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
201       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
202       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
203       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
204       v  += 49;
205     }
206     idx      = 7*i;
207     t[idx]   = s1;t[1+idx] = s2;
208     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
209     t[5+idx] = s6;t[6+idx] = s7;
210   }
211   /* backward solve the upper triangular */
212   for (i=n-1; i>=0; i--) {
213     v   = aa + 49*(adiag[i+1]+1);
214     vi  = aj + adiag[i+1]+1;
215     nz  = adiag[i] - adiag[i+1] - 1;
216     idt = 7*i;
217     s1  = t[idt];  s2 = t[1+idt];
218     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
219     s6  = t[5+idt];s7 = t[6+idt];
220     for (m=0; m<nz; m++) {
221       idx = 7*vi[m];
222       x1  = t[idx];   x2 = t[1+idx];
223       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
224       x6  = t[5+idx]; x7 = t[6+idx];
225       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
226       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
227       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
228       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
229       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
230       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
231       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
232       v  += 49;
233     }
234     idc    = 7*c[i];
235     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
236                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
237     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
238                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
239     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
240                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
241     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
242                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
243     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
244                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
245     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
246                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
247     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
248                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
249   }
250 
251   CHKERRQ(ISRestoreIndices(isrow,&rout));
252   CHKERRQ(ISRestoreIndices(iscol,&cout));
253   CHKERRQ(VecRestoreArrayRead(bb,&b));
254   CHKERRQ(VecRestoreArray(xx,&x));
255   CHKERRQ(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n));
256   PetscFunctionReturn(0);
257 }
258 
259 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
260 {
261   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
262   IS                iscol=a->col,isrow=a->row;
263   const PetscInt    *r,*c,*rout,*cout;
264   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
265   PetscInt          i,nz,idx,idt,idc;
266   const MatScalar   *aa=a->a,*v;
267   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
268   const PetscScalar *b;
269 
270   PetscFunctionBegin;
271   CHKERRQ(VecGetArrayRead(bb,&b));
272   CHKERRQ(VecGetArray(xx,&x));
273   t    = a->solve_work;
274 
275   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
276   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
277 
278   /* forward solve the lower triangular */
279   idx  = 6*(*r++);
280   t[0] = b[idx];   t[1] = b[1+idx];
281   t[2] = b[2+idx]; t[3] = b[3+idx];
282   t[4] = b[4+idx]; t[5] = b[5+idx];
283   for (i=1; i<n; i++) {
284     v   = aa + 36*ai[i];
285     vi  = aj + ai[i];
286     nz  = diag[i] - ai[i];
287     idx = 6*(*r++);
288     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
289     s5  = b[4+idx]; s6 = b[5+idx];
290     while (nz--) {
291       idx = 6*(*vi++);
292       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
293       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
294       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
295       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
296       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
297       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
298       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
299       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
300       v  += 36;
301     }
302     idx      = 6*i;
303     t[idx]   = s1;t[1+idx] = s2;
304     t[2+idx] = s3;t[3+idx] = s4;
305     t[4+idx] = s5;t[5+idx] = s6;
306   }
307   /* backward solve the upper triangular */
308   for (i=n-1; i>=0; i--) {
309     v   = aa + 36*diag[i] + 36;
310     vi  = aj + diag[i] + 1;
311     nz  = ai[i+1] - diag[i] - 1;
312     idt = 6*i;
313     s1  = t[idt];  s2 = t[1+idt];
314     s3  = t[2+idt];s4 = t[3+idt];
315     s5  = t[4+idt];s6 = t[5+idt];
316     while (nz--) {
317       idx = 6*(*vi++);
318       x1  = t[idx];   x2 = t[1+idx];
319       x3  = t[2+idx]; x4 = t[3+idx];
320       x5  = t[4+idx]; x6 = t[5+idx];
321       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
322       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
323       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
324       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
325       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
326       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
327       v  += 36;
328     }
329     idc    = 6*(*c--);
330     v      = aa + 36*diag[i];
331     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
332                         v[18]*s4+v[24]*s5+v[30]*s6;
333     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
334                           v[19]*s4+v[25]*s5+v[31]*s6;
335     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
336                           v[20]*s4+v[26]*s5+v[32]*s6;
337     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
338                           v[21]*s4+v[27]*s5+v[33]*s6;
339     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
340                           v[22]*s4+v[28]*s5+v[34]*s6;
341     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
342                           v[23]*s4+v[29]*s5+v[35]*s6;
343   }
344 
345   CHKERRQ(ISRestoreIndices(isrow,&rout));
346   CHKERRQ(ISRestoreIndices(iscol,&cout));
347   CHKERRQ(VecRestoreArrayRead(bb,&b));
348   CHKERRQ(VecRestoreArray(xx,&x));
349   CHKERRQ(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n));
350   PetscFunctionReturn(0);
351 }
352 
353 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
354 {
355   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
356   IS                iscol=a->col,isrow=a->row;
357   const PetscInt    *r,*c,*rout,*cout;
358   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
359   PetscInt          i,nz,idx,idt,idc,m;
360   const MatScalar   *aa=a->a,*v;
361   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
362   const PetscScalar *b;
363 
364   PetscFunctionBegin;
365   CHKERRQ(VecGetArrayRead(bb,&b));
366   CHKERRQ(VecGetArray(xx,&x));
367   t    = a->solve_work;
368 
369   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
370   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout;
371 
372   /* forward solve the lower triangular */
373   idx  = 6*r[0];
374   t[0] = b[idx];   t[1] = b[1+idx];
375   t[2] = b[2+idx]; t[3] = b[3+idx];
376   t[4] = b[4+idx]; t[5] = b[5+idx];
377   for (i=1; i<n; i++) {
378     v   = aa + 36*ai[i];
379     vi  = aj + ai[i];
380     nz  = ai[i+1] - ai[i];
381     idx = 6*r[i];
382     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
383     s5  = b[4+idx]; s6 = b[5+idx];
384     for (m=0; m<nz; m++) {
385       idx = 6*vi[m];
386       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
387       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
388       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
389       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
390       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
391       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
392       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
393       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
394       v  += 36;
395     }
396     idx      = 6*i;
397     t[idx]   = s1;t[1+idx] = s2;
398     t[2+idx] = s3;t[3+idx] = s4;
399     t[4+idx] = s5;t[5+idx] = s6;
400   }
401   /* backward solve the upper triangular */
402   for (i=n-1; i>=0; i--) {
403     v   = aa + 36*(adiag[i+1]+1);
404     vi  = aj + adiag[i+1]+1;
405     nz  = adiag[i] - adiag[i+1] - 1;
406     idt = 6*i;
407     s1  = t[idt];  s2 = t[1+idt];
408     s3  = t[2+idt];s4 = t[3+idt];
409     s5  = t[4+idt];s6 = t[5+idt];
410     for (m=0; m<nz; m++) {
411       idx = 6*vi[m];
412       x1  = t[idx];   x2 = t[1+idx];
413       x3  = t[2+idx]; x4 = t[3+idx];
414       x5  = t[4+idx]; x6 = t[5+idx];
415       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
416       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
417       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
418       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
419       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
420       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
421       v  += 36;
422     }
423     idc    = 6*c[i];
424     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
425                         v[18]*s4+v[24]*s5+v[30]*s6;
426     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
427                           v[19]*s4+v[25]*s5+v[31]*s6;
428     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
429                           v[20]*s4+v[26]*s5+v[32]*s6;
430     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
431                           v[21]*s4+v[27]*s5+v[33]*s6;
432     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
433                           v[22]*s4+v[28]*s5+v[34]*s6;
434     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
435                           v[23]*s4+v[29]*s5+v[35]*s6;
436   }
437 
438   CHKERRQ(ISRestoreIndices(isrow,&rout));
439   CHKERRQ(ISRestoreIndices(iscol,&cout));
440   CHKERRQ(VecRestoreArrayRead(bb,&b));
441   CHKERRQ(VecRestoreArray(xx,&x));
442   CHKERRQ(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n));
443   PetscFunctionReturn(0);
444 }
445 
446 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
447 {
448   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
449   IS                iscol=a->col,isrow=a->row;
450   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
451   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
452   PetscInt          i,nz,idx,idt,idc;
453   const MatScalar   *aa=a->a,*v;
454   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
455   const PetscScalar *b;
456 
457   PetscFunctionBegin;
458   CHKERRQ(VecGetArrayRead(bb,&b));
459   CHKERRQ(VecGetArray(xx,&x));
460   t    = a->solve_work;
461 
462   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
463   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
464 
465   /* forward solve the lower triangular */
466   idx  = 5*(*r++);
467   t[0] = b[idx];   t[1] = b[1+idx];
468   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
469   for (i=1; i<n; i++) {
470     v   = aa + 25*ai[i];
471     vi  = aj + ai[i];
472     nz  = diag[i] - ai[i];
473     idx = 5*(*r++);
474     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
475     s5  = b[4+idx];
476     while (nz--) {
477       idx = 5*(*vi++);
478       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
479       x4  = t[3+idx];x5 = t[4+idx];
480       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
481       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
482       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
483       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
484       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
485       v  += 25;
486     }
487     idx      = 5*i;
488     t[idx]   = s1;t[1+idx] = s2;
489     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
490   }
491   /* backward solve the upper triangular */
492   for (i=n-1; i>=0; i--) {
493     v   = aa + 25*diag[i] + 25;
494     vi  = aj + diag[i] + 1;
495     nz  = ai[i+1] - diag[i] - 1;
496     idt = 5*i;
497     s1  = t[idt];  s2 = t[1+idt];
498     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
499     while (nz--) {
500       idx = 5*(*vi++);
501       x1  = t[idx];   x2 = t[1+idx];
502       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
503       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
504       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
505       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
506       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
507       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
508       v  += 25;
509     }
510     idc    = 5*(*c--);
511     v      = aa + 25*diag[i];
512     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
513                         v[15]*s4+v[20]*s5;
514     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
515                           v[16]*s4+v[21]*s5;
516     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
517                           v[17]*s4+v[22]*s5;
518     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
519                           v[18]*s4+v[23]*s5;
520     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
521                           v[19]*s4+v[24]*s5;
522   }
523 
524   CHKERRQ(ISRestoreIndices(isrow,&rout));
525   CHKERRQ(ISRestoreIndices(iscol,&cout));
526   CHKERRQ(VecRestoreArrayRead(bb,&b));
527   CHKERRQ(VecRestoreArray(xx,&x));
528   CHKERRQ(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n));
529   PetscFunctionReturn(0);
530 }
531 
532 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
533 {
534   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
535   IS                iscol=a->col,isrow=a->row;
536   const PetscInt    *r,*c,*rout,*cout;
537   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
538   PetscInt          i,nz,idx,idt,idc,m;
539   const MatScalar   *aa=a->a,*v;
540   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
541   const PetscScalar *b;
542 
543   PetscFunctionBegin;
544   CHKERRQ(VecGetArrayRead(bb,&b));
545   CHKERRQ(VecGetArray(xx,&x));
546   t    = a->solve_work;
547 
548   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
549   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout;
550 
551   /* forward solve the lower triangular */
552   idx  = 5*r[0];
553   t[0] = b[idx];   t[1] = b[1+idx];
554   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
555   for (i=1; i<n; i++) {
556     v   = aa + 25*ai[i];
557     vi  = aj + ai[i];
558     nz  = ai[i+1] - ai[i];
559     idx = 5*r[i];
560     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
561     s5  = b[4+idx];
562     for (m=0; m<nz; m++) {
563       idx = 5*vi[m];
564       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
565       x4  = t[3+idx];x5 = t[4+idx];
566       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
567       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
568       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
569       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
570       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
571       v  += 25;
572     }
573     idx      = 5*i;
574     t[idx]   = s1;t[1+idx] = s2;
575     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
576   }
577   /* backward solve the upper triangular */
578   for (i=n-1; i>=0; i--) {
579     v   = aa + 25*(adiag[i+1]+1);
580     vi  = aj + adiag[i+1]+1;
581     nz  = adiag[i] - adiag[i+1] - 1;
582     idt = 5*i;
583     s1  = t[idt];  s2 = t[1+idt];
584     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
585     for (m=0; m<nz; m++) {
586       idx = 5*vi[m];
587       x1  = t[idx];   x2 = t[1+idx];
588       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
589       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
590       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
591       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
592       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
593       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
594       v  += 25;
595     }
596     idc    = 5*c[i];
597     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
598                         v[15]*s4+v[20]*s5;
599     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
600                           v[16]*s4+v[21]*s5;
601     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
602                           v[17]*s4+v[22]*s5;
603     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
604                           v[18]*s4+v[23]*s5;
605     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
606                           v[19]*s4+v[24]*s5;
607   }
608 
609   CHKERRQ(ISRestoreIndices(isrow,&rout));
610   CHKERRQ(ISRestoreIndices(iscol,&cout));
611   CHKERRQ(VecRestoreArrayRead(bb,&b));
612   CHKERRQ(VecRestoreArray(xx,&x));
613   CHKERRQ(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n));
614   PetscFunctionReturn(0);
615 }
616 
617 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
618 {
619   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
620   IS                iscol=a->col,isrow=a->row;
621   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
622   PetscInt          i,nz,idx,idt,idc;
623   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
624   const MatScalar   *aa=a->a,*v;
625   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
626   const PetscScalar *b;
627 
628   PetscFunctionBegin;
629   CHKERRQ(VecGetArrayRead(bb,&b));
630   CHKERRQ(VecGetArray(xx,&x));
631   t    = a->solve_work;
632 
633   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
634   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
635 
636   /* forward solve the lower triangular */
637   idx  = 4*(*r++);
638   t[0] = b[idx];   t[1] = b[1+idx];
639   t[2] = b[2+idx]; t[3] = b[3+idx];
640   for (i=1; i<n; i++) {
641     v   = aa + 16*ai[i];
642     vi  = aj + ai[i];
643     nz  = diag[i] - ai[i];
644     idx = 4*(*r++);
645     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
646     while (nz--) {
647       idx = 4*(*vi++);
648       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
649       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
650       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
651       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
652       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
653       v  += 16;
654     }
655     idx      = 4*i;
656     t[idx]   = s1;t[1+idx] = s2;
657     t[2+idx] = s3;t[3+idx] = s4;
658   }
659   /* backward solve the upper triangular */
660   for (i=n-1; i>=0; i--) {
661     v   = aa + 16*diag[i] + 16;
662     vi  = aj + diag[i] + 1;
663     nz  = ai[i+1] - diag[i] - 1;
664     idt = 4*i;
665     s1  = t[idt];  s2 = t[1+idt];
666     s3  = t[2+idt];s4 = t[3+idt];
667     while (nz--) {
668       idx = 4*(*vi++);
669       x1  = t[idx];   x2 = t[1+idx];
670       x3  = t[2+idx]; x4 = t[3+idx];
671       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
672       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
673       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
674       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
675       v  += 16;
676     }
677     idc      = 4*(*c--);
678     v        = aa + 16*diag[i];
679     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
680     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
681     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
682     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
683   }
684 
685   CHKERRQ(ISRestoreIndices(isrow,&rout));
686   CHKERRQ(ISRestoreIndices(iscol,&cout));
687   CHKERRQ(VecRestoreArrayRead(bb,&b));
688   CHKERRQ(VecRestoreArray(xx,&x));
689   CHKERRQ(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
690   PetscFunctionReturn(0);
691 }
692 
693 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
694 {
695   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
696   IS                iscol=a->col,isrow=a->row;
697   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
698   PetscInt          i,nz,idx,idt,idc,m;
699   const PetscInt    *r,*c,*rout,*cout;
700   const MatScalar   *aa=a->a,*v;
701   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
702   const PetscScalar *b;
703 
704   PetscFunctionBegin;
705   CHKERRQ(VecGetArrayRead(bb,&b));
706   CHKERRQ(VecGetArray(xx,&x));
707   t    = a->solve_work;
708 
709   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
710   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout;
711 
712   /* forward solve the lower triangular */
713   idx  = 4*r[0];
714   t[0] = b[idx];   t[1] = b[1+idx];
715   t[2] = b[2+idx]; t[3] = b[3+idx];
716   for (i=1; i<n; i++) {
717     v   = aa + 16*ai[i];
718     vi  = aj + ai[i];
719     nz  = ai[i+1] - ai[i];
720     idx = 4*r[i];
721     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
722     for (m=0; m<nz; m++) {
723       idx = 4*vi[m];
724       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
725       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
726       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
727       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
728       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
729       v  += 16;
730     }
731     idx      = 4*i;
732     t[idx]   = s1;t[1+idx] = s2;
733     t[2+idx] = s3;t[3+idx] = s4;
734   }
735   /* backward solve the upper triangular */
736   for (i=n-1; i>=0; i--) {
737     v   = aa + 16*(adiag[i+1]+1);
738     vi  = aj + adiag[i+1]+1;
739     nz  = adiag[i] - adiag[i+1] - 1;
740     idt = 4*i;
741     s1  = t[idt];  s2 = t[1+idt];
742     s3  = t[2+idt];s4 = t[3+idt];
743     for (m=0; m<nz; m++) {
744       idx = 4*vi[m];
745       x1  = t[idx];   x2 = t[1+idx];
746       x3  = t[2+idx]; x4 = t[3+idx];
747       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
748       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
749       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
750       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
751       v  += 16;
752     }
753     idc      = 4*c[i];
754     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
755     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
756     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
757     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
758   }
759 
760   CHKERRQ(ISRestoreIndices(isrow,&rout));
761   CHKERRQ(ISRestoreIndices(iscol,&cout));
762   CHKERRQ(VecRestoreArrayRead(bb,&b));
763   CHKERRQ(VecRestoreArray(xx,&x));
764   CHKERRQ(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
765   PetscFunctionReturn(0);
766 }
767 
768 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
769 {
770   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
771   IS                iscol=a->col,isrow=a->row;
772   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
773   PetscInt          i,nz,idx,idt,idc;
774   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
775   const MatScalar   *aa=a->a,*v;
776   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
777   PetscScalar       *x;
778   const PetscScalar *b;
779 
780   PetscFunctionBegin;
781   CHKERRQ(VecGetArrayRead(bb,&b));
782   CHKERRQ(VecGetArray(xx,&x));
783   t    = (MatScalar*)a->solve_work;
784 
785   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
786   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
787 
788   /* forward solve the lower triangular */
789   idx  = 4*(*r++);
790   t[0] = (MatScalar)b[idx];
791   t[1] = (MatScalar)b[1+idx];
792   t[2] = (MatScalar)b[2+idx];
793   t[3] = (MatScalar)b[3+idx];
794   for (i=1; i<n; i++) {
795     v   = aa + 16*ai[i];
796     vi  = aj + ai[i];
797     nz  = diag[i] - ai[i];
798     idx = 4*(*r++);
799     s1  = (MatScalar)b[idx];
800     s2  = (MatScalar)b[1+idx];
801     s3  = (MatScalar)b[2+idx];
802     s4  = (MatScalar)b[3+idx];
803     while (nz--) {
804       idx = 4*(*vi++);
805       x1  = t[idx];
806       x2  = t[1+idx];
807       x3  = t[2+idx];
808       x4  = t[3+idx];
809       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
810       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
811       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
812       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
813       v  += 16;
814     }
815     idx      = 4*i;
816     t[idx]   = s1;
817     t[1+idx] = s2;
818     t[2+idx] = s3;
819     t[3+idx] = s4;
820   }
821   /* backward solve the upper triangular */
822   for (i=n-1; i>=0; i--) {
823     v   = aa + 16*diag[i] + 16;
824     vi  = aj + diag[i] + 1;
825     nz  = ai[i+1] - diag[i] - 1;
826     idt = 4*i;
827     s1  = t[idt];
828     s2  = t[1+idt];
829     s3  = t[2+idt];
830     s4  = t[3+idt];
831     while (nz--) {
832       idx = 4*(*vi++);
833       x1  = t[idx];
834       x2  = t[1+idx];
835       x3  = t[2+idx];
836       x4  = t[3+idx];
837       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
838       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
839       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
840       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
841       v  += 16;
842     }
843     idc      = 4*(*c--);
844     v        = aa + 16*diag[i];
845     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
846     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
847     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
848     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
849     x[idc]   = (PetscScalar)t[idt];
850     x[1+idc] = (PetscScalar)t[1+idt];
851     x[2+idc] = (PetscScalar)t[2+idt];
852     x[3+idc] = (PetscScalar)t[3+idt];
853   }
854 
855   CHKERRQ(ISRestoreIndices(isrow,&rout));
856   CHKERRQ(ISRestoreIndices(iscol,&cout));
857   CHKERRQ(VecRestoreArrayRead(bb,&b));
858   CHKERRQ(VecRestoreArray(xx,&x));
859   CHKERRQ(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
860   PetscFunctionReturn(0);
861 }
862 
863 #if defined(PETSC_HAVE_SSE)
864 
865 #include PETSC_HAVE_SSE
866 
867 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
868 {
869   /*
870      Note: This code uses demotion of double
871      to float when performing the mixed-mode computation.
872      This may not be numerically reasonable for all applications.
873   */
874   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
875   IS             iscol=a->col,isrow=a->row;
876   PetscErrorCode ierr;
877   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
878   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
879   MatScalar      *aa=a->a,*v;
880   PetscScalar    *x,*b,*t;
881 
882   /* Make space in temp stack for 16 Byte Aligned arrays */
883   float         ssealignedspace[11],*tmps,*tmpx;
884   unsigned long offset;
885 
886   PetscFunctionBegin;
887   SSE_SCOPE_BEGIN;
888 
889   offset = (unsigned long)ssealignedspace % 16;
890   if (offset) offset = (16 - offset)/4;
891   tmps = &ssealignedspace[offset];
892   tmpx = &ssealignedspace[offset+4];
893   PREFETCH_NTA(aa+16*ai[1]);
894 
895   CHKERRQ(VecGetArray(bb,&b));
896   CHKERRQ(VecGetArray(xx,&x));
897   t    = a->solve_work;
898 
899   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
900   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
901 
902   /* forward solve the lower triangular */
903   idx  = 4*(*r++);
904   t[0] = b[idx];   t[1] = b[1+idx];
905   t[2] = b[2+idx]; t[3] = b[3+idx];
906   v    =  aa + 16*ai[1];
907 
908   for (i=1; i<n;) {
909     PREFETCH_NTA(&v[8]);
910     vi  =  aj      + ai[i];
911     nz  =  diag[i] - ai[i];
912     idx =  4*(*r++);
913 
914     /* Demote sum from double to float */
915     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
916     LOAD_PS(tmps,XMM7);
917 
918     while (nz--) {
919       PREFETCH_NTA(&v[16]);
920       idx = 4*(*vi++);
921 
922       /* Demote solution (so far) from double to float */
923       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
924 
925       /* 4x4 Matrix-Vector product with negative accumulation: */
926       SSE_INLINE_BEGIN_2(tmpx,v)
927       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
928 
929       /* First Column */
930       SSE_COPY_PS(XMM0,XMM6)
931       SSE_SHUFFLE(XMM0,XMM0,0x00)
932       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
933       SSE_SUB_PS(XMM7,XMM0)
934 
935       /* Second Column */
936       SSE_COPY_PS(XMM1,XMM6)
937       SSE_SHUFFLE(XMM1,XMM1,0x55)
938       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
939       SSE_SUB_PS(XMM7,XMM1)
940 
941       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
942 
943       /* Third Column */
944       SSE_COPY_PS(XMM2,XMM6)
945       SSE_SHUFFLE(XMM2,XMM2,0xAA)
946       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
947       SSE_SUB_PS(XMM7,XMM2)
948 
949       /* Fourth Column */
950       SSE_COPY_PS(XMM3,XMM6)
951       SSE_SHUFFLE(XMM3,XMM3,0xFF)
952       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
953       SSE_SUB_PS(XMM7,XMM3)
954       SSE_INLINE_END_2
955 
956       v += 16;
957     }
958     idx = 4*i;
959     v   = aa + 16*ai[++i];
960     PREFETCH_NTA(v);
961     STORE_PS(tmps,XMM7);
962 
963     /* Promote result from float to double */
964     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
965   }
966   /* backward solve the upper triangular */
967   idt  = 4*(n-1);
968   ai16 = 16*diag[n-1];
969   v    = aa + ai16 + 16;
970   for (i=n-1; i>=0;) {
971     PREFETCH_NTA(&v[8]);
972     vi = aj + diag[i] + 1;
973     nz = ai[i+1] - diag[i] - 1;
974 
975     /* Demote accumulator from double to float */
976     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
977     LOAD_PS(tmps,XMM7);
978 
979     while (nz--) {
980       PREFETCH_NTA(&v[16]);
981       idx = 4*(*vi++);
982 
983       /* Demote solution (so far) from double to float */
984       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
985 
986       /* 4x4 Matrix-Vector Product with negative accumulation: */
987       SSE_INLINE_BEGIN_2(tmpx,v)
988       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
989 
990       /* First Column */
991       SSE_COPY_PS(XMM0,XMM6)
992       SSE_SHUFFLE(XMM0,XMM0,0x00)
993       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
994       SSE_SUB_PS(XMM7,XMM0)
995 
996       /* Second Column */
997       SSE_COPY_PS(XMM1,XMM6)
998       SSE_SHUFFLE(XMM1,XMM1,0x55)
999       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1000       SSE_SUB_PS(XMM7,XMM1)
1001 
1002       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1003 
1004       /* Third Column */
1005       SSE_COPY_PS(XMM2,XMM6)
1006       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1007       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1008       SSE_SUB_PS(XMM7,XMM2)
1009 
1010       /* Fourth Column */
1011       SSE_COPY_PS(XMM3,XMM6)
1012       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1013       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1014       SSE_SUB_PS(XMM7,XMM3)
1015       SSE_INLINE_END_2
1016       v += 16;
1017     }
1018     v    = aa + ai16;
1019     ai16 = 16*diag[--i];
1020     PREFETCH_NTA(aa+ai16+16);
1021     /*
1022        Scale the result by the diagonal 4x4 block,
1023        which was inverted as part of the factorization
1024     */
1025     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1026     /* First Column */
1027     SSE_COPY_PS(XMM0,XMM7)
1028     SSE_SHUFFLE(XMM0,XMM0,0x00)
1029     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1030 
1031     /* Second Column */
1032     SSE_COPY_PS(XMM1,XMM7)
1033     SSE_SHUFFLE(XMM1,XMM1,0x55)
1034     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1035     SSE_ADD_PS(XMM0,XMM1)
1036 
1037     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1038 
1039     /* Third Column */
1040     SSE_COPY_PS(XMM2,XMM7)
1041     SSE_SHUFFLE(XMM2,XMM2,0xAA)
1042     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1043     SSE_ADD_PS(XMM0,XMM2)
1044 
1045     /* Fourth Column */
1046     SSE_COPY_PS(XMM3,XMM7)
1047     SSE_SHUFFLE(XMM3,XMM3,0xFF)
1048     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1049     SSE_ADD_PS(XMM0,XMM3)
1050 
1051     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1052     SSE_INLINE_END_3
1053 
1054     /* Promote solution from float to double */
1055     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1056 
1057     /* Apply reordering to t and stream into x.    */
1058     /* This way, x doesn't pollute the cache.      */
1059     /* Be careful with size: 2 doubles = 4 floats! */
1060     idc = 4*(*c--);
1061     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1062     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1063     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1064     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1065     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1066     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1067     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1068     SSE_INLINE_END_2
1069     v    = aa + ai16 + 16;
1070     idt -= 4;
1071   }
1072 
1073   CHKERRQ(ISRestoreIndices(isrow,&rout));
1074   CHKERRQ(ISRestoreIndices(iscol,&cout));
1075   CHKERRQ(VecRestoreArray(bb,&b));
1076   CHKERRQ(VecRestoreArray(xx,&x));
1077   CHKERRQ(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
1078   SSE_SCOPE_END;
1079   PetscFunctionReturn(0);
1080 }
1081 
1082 #endif
1083 
1084 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1085 {
1086   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1087   IS                iscol=a->col,isrow=a->row;
1088   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1089   PetscInt          i,nz,idx,idt,idc;
1090   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1091   const MatScalar   *aa=a->a,*v;
1092   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1093   const PetscScalar *b;
1094 
1095   PetscFunctionBegin;
1096   CHKERRQ(VecGetArrayRead(bb,&b));
1097   CHKERRQ(VecGetArray(xx,&x));
1098   t    = a->solve_work;
1099 
1100   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
1101   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
1102 
1103   /* forward solve the lower triangular */
1104   idx  = 3*(*r++);
1105   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1106   for (i=1; i<n; i++) {
1107     v   = aa + 9*ai[i];
1108     vi  = aj + ai[i];
1109     nz  = diag[i] - ai[i];
1110     idx = 3*(*r++);
1111     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1112     while (nz--) {
1113       idx = 3*(*vi++);
1114       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1115       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1116       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1117       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1118       v  += 9;
1119     }
1120     idx    = 3*i;
1121     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1122   }
1123   /* backward solve the upper triangular */
1124   for (i=n-1; i>=0; i--) {
1125     v   = aa + 9*diag[i] + 9;
1126     vi  = aj + diag[i] + 1;
1127     nz  = ai[i+1] - diag[i] - 1;
1128     idt = 3*i;
1129     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1130     while (nz--) {
1131       idx = 3*(*vi++);
1132       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1133       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1134       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1135       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1136       v  += 9;
1137     }
1138     idc      = 3*(*c--);
1139     v        = aa + 9*diag[i];
1140     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1141     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1142     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1143   }
1144   CHKERRQ(ISRestoreIndices(isrow,&rout));
1145   CHKERRQ(ISRestoreIndices(iscol,&cout));
1146   CHKERRQ(VecRestoreArrayRead(bb,&b));
1147   CHKERRQ(VecRestoreArray(xx,&x));
1148   CHKERRQ(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n));
1149   PetscFunctionReturn(0);
1150 }
1151 
1152 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1153 {
1154   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1155   IS                iscol=a->col,isrow=a->row;
1156   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1157   PetscInt          i,nz,idx,idt,idc,m;
1158   const PetscInt    *r,*c,*rout,*cout;
1159   const MatScalar   *aa=a->a,*v;
1160   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1161   const PetscScalar *b;
1162 
1163   PetscFunctionBegin;
1164   CHKERRQ(VecGetArrayRead(bb,&b));
1165   CHKERRQ(VecGetArray(xx,&x));
1166   t    = a->solve_work;
1167 
1168   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
1169   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout;
1170 
1171   /* forward solve the lower triangular */
1172   idx  = 3*r[0];
1173   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1174   for (i=1; i<n; i++) {
1175     v   = aa + 9*ai[i];
1176     vi  = aj + ai[i];
1177     nz  = ai[i+1] - ai[i];
1178     idx = 3*r[i];
1179     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1180     for (m=0; m<nz; m++) {
1181       idx = 3*vi[m];
1182       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1183       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1184       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1185       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1186       v  += 9;
1187     }
1188     idx    = 3*i;
1189     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1190   }
1191   /* backward solve the upper triangular */
1192   for (i=n-1; i>=0; i--) {
1193     v   = aa + 9*(adiag[i+1]+1);
1194     vi  = aj + adiag[i+1]+1;
1195     nz  = adiag[i] - adiag[i+1] - 1;
1196     idt = 3*i;
1197     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1198     for (m=0; m<nz; m++) {
1199       idx = 3*vi[m];
1200       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1201       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1202       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1203       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1204       v  += 9;
1205     }
1206     idc      = 3*c[i];
1207     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1208     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1209     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1210   }
1211   CHKERRQ(ISRestoreIndices(isrow,&rout));
1212   CHKERRQ(ISRestoreIndices(iscol,&cout));
1213   CHKERRQ(VecRestoreArrayRead(bb,&b));
1214   CHKERRQ(VecRestoreArray(xx,&x));
1215   CHKERRQ(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n));
1216   PetscFunctionReturn(0);
1217 }
1218 
1219 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1220 {
1221   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1222   IS                iscol=a->col,isrow=a->row;
1223   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1224   PetscInt          i,nz,idx,idt,idc;
1225   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1226   const MatScalar   *aa=a->a,*v;
1227   PetscScalar       *x,s1,s2,x1,x2,*t;
1228   const PetscScalar *b;
1229 
1230   PetscFunctionBegin;
1231   CHKERRQ(VecGetArrayRead(bb,&b));
1232   CHKERRQ(VecGetArray(xx,&x));
1233   t    = a->solve_work;
1234 
1235   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
1236   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
1237 
1238   /* forward solve the lower triangular */
1239   idx  = 2*(*r++);
1240   t[0] = b[idx]; t[1] = b[1+idx];
1241   for (i=1; i<n; i++) {
1242     v   = aa + 4*ai[i];
1243     vi  = aj + ai[i];
1244     nz  = diag[i] - ai[i];
1245     idx = 2*(*r++);
1246     s1  = b[idx]; s2 = b[1+idx];
1247     while (nz--) {
1248       idx = 2*(*vi++);
1249       x1  = t[idx]; x2 = t[1+idx];
1250       s1 -= v[0]*x1 + v[2]*x2;
1251       s2 -= v[1]*x1 + v[3]*x2;
1252       v  += 4;
1253     }
1254     idx    = 2*i;
1255     t[idx] = s1; t[1+idx] = s2;
1256   }
1257   /* backward solve the upper triangular */
1258   for (i=n-1; i>=0; i--) {
1259     v   = aa + 4*diag[i] + 4;
1260     vi  = aj + diag[i] + 1;
1261     nz  = ai[i+1] - diag[i] - 1;
1262     idt = 2*i;
1263     s1  = t[idt]; s2 = t[1+idt];
1264     while (nz--) {
1265       idx = 2*(*vi++);
1266       x1  = t[idx]; x2 = t[1+idx];
1267       s1 -= v[0]*x1 + v[2]*x2;
1268       s2 -= v[1]*x1 + v[3]*x2;
1269       v  += 4;
1270     }
1271     idc      = 2*(*c--);
1272     v        = aa + 4*diag[i];
1273     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1274     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1275   }
1276   CHKERRQ(ISRestoreIndices(isrow,&rout));
1277   CHKERRQ(ISRestoreIndices(iscol,&cout));
1278   CHKERRQ(VecRestoreArrayRead(bb,&b));
1279   CHKERRQ(VecRestoreArray(xx,&x));
1280   CHKERRQ(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n));
1281   PetscFunctionReturn(0);
1282 }
1283 
1284 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1285 {
1286   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1287   IS                iscol=a->col,isrow=a->row;
1288   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1289   PetscInt          i,nz,idx,jdx,idt,idc,m;
1290   const PetscInt    *r,*c,*rout,*cout;
1291   const MatScalar   *aa=a->a,*v;
1292   PetscScalar       *x,s1,s2,x1,x2,*t;
1293   const PetscScalar *b;
1294 
1295   PetscFunctionBegin;
1296   CHKERRQ(VecGetArrayRead(bb,&b));
1297   CHKERRQ(VecGetArray(xx,&x));
1298   t    = a->solve_work;
1299 
1300   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
1301   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout;
1302 
1303   /* forward solve the lower triangular */
1304   idx  = 2*r[0];
1305   t[0] = b[idx]; t[1] = b[1+idx];
1306   for (i=1; i<n; i++) {
1307     v   = aa + 4*ai[i];
1308     vi  = aj + ai[i];
1309     nz  = ai[i+1] - ai[i];
1310     idx = 2*r[i];
1311     s1  = b[idx]; s2 = b[1+idx];
1312     for (m=0; m<nz; m++) {
1313       jdx = 2*vi[m];
1314       x1  = t[jdx]; x2 = t[1+jdx];
1315       s1 -= v[0]*x1 + v[2]*x2;
1316       s2 -= v[1]*x1 + v[3]*x2;
1317       v  += 4;
1318     }
1319     idx    = 2*i;
1320     t[idx] = s1; t[1+idx] = s2;
1321   }
1322   /* backward solve the upper triangular */
1323   for (i=n-1; i>=0; i--) {
1324     v   = aa + 4*(adiag[i+1]+1);
1325     vi  = aj + adiag[i+1]+1;
1326     nz  = adiag[i] - adiag[i+1] - 1;
1327     idt = 2*i;
1328     s1  = t[idt]; s2 = t[1+idt];
1329     for (m=0; m<nz; m++) {
1330       idx = 2*vi[m];
1331       x1  = t[idx]; x2 = t[1+idx];
1332       s1 -= v[0]*x1 + v[2]*x2;
1333       s2 -= v[1]*x1 + v[3]*x2;
1334       v  += 4;
1335     }
1336     idc      = 2*c[i];
1337     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1338     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1339   }
1340   CHKERRQ(ISRestoreIndices(isrow,&rout));
1341   CHKERRQ(ISRestoreIndices(iscol,&cout));
1342   CHKERRQ(VecRestoreArrayRead(bb,&b));
1343   CHKERRQ(VecRestoreArray(xx,&x));
1344   CHKERRQ(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n));
1345   PetscFunctionReturn(0);
1346 }
1347 
1348 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1349 {
1350   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1351   IS                iscol=a->col,isrow=a->row;
1352   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1353   PetscInt          i,nz;
1354   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1355   const MatScalar   *aa=a->a,*v;
1356   PetscScalar       *x,s1,*t;
1357   const PetscScalar *b;
1358 
1359   PetscFunctionBegin;
1360   if (!n) PetscFunctionReturn(0);
1361 
1362   CHKERRQ(VecGetArrayRead(bb,&b));
1363   CHKERRQ(VecGetArray(xx,&x));
1364   t    = a->solve_work;
1365 
1366   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
1367   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1);
1368 
1369   /* forward solve the lower triangular */
1370   t[0] = b[*r++];
1371   for (i=1; i<n; i++) {
1372     v  = aa + ai[i];
1373     vi = aj + ai[i];
1374     nz = diag[i] - ai[i];
1375     s1 = b[*r++];
1376     while (nz--) {
1377       s1 -= (*v++)*t[*vi++];
1378     }
1379     t[i] = s1;
1380   }
1381   /* backward solve the upper triangular */
1382   for (i=n-1; i>=0; i--) {
1383     v  = aa + diag[i] + 1;
1384     vi = aj + diag[i] + 1;
1385     nz = ai[i+1] - diag[i] - 1;
1386     s1 = t[i];
1387     while (nz--) {
1388       s1 -= (*v++)*t[*vi++];
1389     }
1390     x[*c--] = t[i] = aa[diag[i]]*s1;
1391   }
1392 
1393   CHKERRQ(ISRestoreIndices(isrow,&rout));
1394   CHKERRQ(ISRestoreIndices(iscol,&cout));
1395   CHKERRQ(VecRestoreArrayRead(bb,&b));
1396   CHKERRQ(VecRestoreArray(xx,&x));
1397   CHKERRQ(PetscLogFlops(2.0*1*(a->nz) - A->cmap->n));
1398   PetscFunctionReturn(0);
1399 }
1400 
1401 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1402 {
1403   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
1404   IS                iscol = a->col,isrow = a->row;
1405   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1406   const PetscInt    *rout,*cout,*r,*c;
1407   PetscScalar       *x,*tmp,sum;
1408   const PetscScalar *b;
1409   const MatScalar   *aa = a->a,*v;
1410 
1411   PetscFunctionBegin;
1412   if (!n) PetscFunctionReturn(0);
1413 
1414   CHKERRQ(VecGetArrayRead(bb,&b));
1415   CHKERRQ(VecGetArray(xx,&x));
1416   tmp  = a->solve_work;
1417 
1418   CHKERRQ(ISGetIndices(isrow,&rout)); r = rout;
1419   CHKERRQ(ISGetIndices(iscol,&cout)); c = cout;
1420 
1421   /* forward solve the lower triangular */
1422   tmp[0] = b[r[0]];
1423   v      = aa;
1424   vi     = aj;
1425   for (i=1; i<n; i++) {
1426     nz  = ai[i+1] - ai[i];
1427     sum = b[r[i]];
1428     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1429     tmp[i] = sum;
1430     v     += nz; vi += nz;
1431   }
1432 
1433   /* backward solve the upper triangular */
1434   for (i=n-1; i>=0; i--) {
1435     v   = aa + adiag[i+1]+1;
1436     vi  = aj + adiag[i+1]+1;
1437     nz  = adiag[i]-adiag[i+1]-1;
1438     sum = tmp[i];
1439     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1440     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1441   }
1442 
1443   CHKERRQ(ISRestoreIndices(isrow,&rout));
1444   CHKERRQ(ISRestoreIndices(iscol,&cout));
1445   CHKERRQ(VecRestoreArrayRead(bb,&b));
1446   CHKERRQ(VecRestoreArray(xx,&x));
1447   CHKERRQ(PetscLogFlops(2.0*a->nz - A->cmap->n));
1448   PetscFunctionReturn(0);
1449 }
1450