xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 72ce74bd30d5d09d1371a3eb387704f4af395d1a)
1 /*
2     Factorization code for BAIJ format.
3 */
4 
5 #include "src/mat/impls/baij/seq/baij.h"
6 #include "src/inline/ilu.h"
7 #include "src/inline/dot.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
11 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
12 {
13   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
14   PetscErrorCode ierr;
15   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
16   int             *diag = a->diag;
17   MatScalar       *aa=a->a,*v;
18   PetscScalar     s1,*x,*b;
19 
20   PetscFunctionBegin;
21   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
22   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24 
25   /* forward solve the U^T */
26   for (i=0; i<n; i++) {
27 
28     v     = aa + diag[i];
29     /* multiply by the inverse of the block diagonal */
30     s1    = (*v++)*x[i];
31     vi    = aj + diag[i] + 1;
32     nz    = ai[i+1] - diag[i] - 1;
33     while (nz--) {
34       x[*vi++]  -= (*v++)*s1;
35     }
36     x[i]   = s1;
37   }
38   /* backward solve the L^T */
39   for (i=n-1; i>=0; i--){
40     v    = aa + diag[i] - 1;
41     vi   = aj + diag[i] - 1;
42     nz   = diag[i] - ai[i];
43     s1   = x[i];
44     while (nz--) {
45       x[*vi--]   -=  (*v--)*s1;
46     }
47   }
48   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50   PetscLogFlops(2*(a->nz) - A->n);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
56 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57 {
58   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59   PetscErrorCode ierr;
60   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61   int             *diag = a->diag,oidx;
62   MatScalar       *aa=a->a,*v;
63   PetscScalar     s1,s2,x1,x2;
64   PetscScalar     *x,*b;
65 
66   PetscFunctionBegin;
67   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
68   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
69   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70 
71   /* forward solve the U^T */
72   idx = 0;
73   for (i=0; i<n; i++) {
74 
75     v     = aa + 4*diag[i];
76     /* multiply by the inverse of the block diagonal */
77     x1 = x[idx];   x2 = x[1+idx];
78     s1 = v[0]*x1  +  v[1]*x2;
79     s2 = v[2]*x1  +  v[3]*x2;
80     v += 4;
81 
82     vi    = aj + diag[i] + 1;
83     nz    = ai[i+1] - diag[i] - 1;
84     while (nz--) {
85       oidx = 2*(*vi++);
86       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88       v  += 4;
89     }
90     x[idx]   = s1;x[1+idx] = s2;
91     idx += 2;
92   }
93   /* backward solve the L^T */
94   for (i=n-1; i>=0; i--){
95     v    = aa + 4*diag[i] - 4;
96     vi   = aj + diag[i] - 1;
97     nz   = diag[i] - ai[i];
98     idt  = 2*i;
99     s1   = x[idt];  s2 = x[1+idt];
100     while (nz--) {
101       idx   = 2*(*vi--);
102       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104       v -= 4;
105     }
106   }
107   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110   PetscFunctionReturn(0);
111 }
112 
113 #undef __FUNCT__
114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
115 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116 {
117   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118   PetscErrorCode ierr;
119   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
120   int             *diag = a->diag,oidx;
121   MatScalar       *aa=a->a,*v;
122   PetscScalar     s1,s2,s3,x1,x2,x3;
123   PetscScalar     *x,*b;
124 
125   PetscFunctionBegin;
126   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
127   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
128   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
129 
130   /* forward solve the U^T */
131   idx = 0;
132   for (i=0; i<n; i++) {
133 
134     v     = aa + 9*diag[i];
135     /* multiply by the inverse of the block diagonal */
136     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
137     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
138     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
139     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
140     v += 9;
141 
142     vi    = aj + diag[i] + 1;
143     nz    = ai[i+1] - diag[i] - 1;
144     while (nz--) {
145       oidx = 3*(*vi++);
146       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
147       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
148       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
149       v  += 9;
150     }
151     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
152     idx += 3;
153   }
154   /* backward solve the L^T */
155   for (i=n-1; i>=0; i--){
156     v    = aa + 9*diag[i] - 9;
157     vi   = aj + diag[i] - 1;
158     nz   = diag[i] - ai[i];
159     idt  = 3*i;
160     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
161     while (nz--) {
162       idx   = 3*(*vi--);
163       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
164       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
165       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
166       v -= 9;
167     }
168   }
169   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
170   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
171   PetscLogFlops(2*9*(a->nz) - 3*A->n);
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
177 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
178 {
179   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
180   PetscErrorCode ierr;
181   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
182   int             *diag = a->diag,oidx;
183   MatScalar       *aa=a->a,*v;
184   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
185   PetscScalar     *x,*b;
186 
187   PetscFunctionBegin;
188   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
189   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
190   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
191 
192   /* forward solve the U^T */
193   idx = 0;
194   for (i=0; i<n; i++) {
195 
196     v     = aa + 16*diag[i];
197     /* multiply by the inverse of the block diagonal */
198     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
199     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
200     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
201     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
202     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
203     v += 16;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 4*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
210       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
211       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
212       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
213       v  += 16;
214     }
215     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
216     idx += 4;
217   }
218   /* backward solve the L^T */
219   for (i=n-1; i>=0; i--){
220     v    = aa + 16*diag[i] - 16;
221     vi   = aj + diag[i] - 1;
222     nz   = diag[i] - ai[i];
223     idt  = 4*i;
224     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
225     while (nz--) {
226       idx   = 4*(*vi--);
227       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
228       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
229       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
230       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
231       v -= 16;
232     }
233   }
234   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
235   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
236   PetscLogFlops(2*16*(a->nz) - 4*A->n);
237   PetscFunctionReturn(0);
238 }
239 
240 #undef __FUNCT__
241 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
242 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
243 {
244   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
245   PetscErrorCode ierr;
246   int  i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
247   int             *diag = a->diag,oidx;
248   MatScalar       *aa=a->a,*v;
249   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
250   PetscScalar     *x,*b;
251 
252   PetscFunctionBegin;
253   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
254   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
255   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
256 
257   /* forward solve the U^T */
258   idx = 0;
259   for (i=0; i<n; i++) {
260 
261     v     = aa + 25*diag[i];
262     /* multiply by the inverse of the block diagonal */
263     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
264     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
265     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
266     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
267     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
268     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
269     v += 25;
270 
271     vi    = aj + diag[i] + 1;
272     nz    = ai[i+1] - diag[i] - 1;
273     while (nz--) {
274       oidx = 5*(*vi++);
275       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
276       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
277       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
278       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
279       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
280       v  += 25;
281     }
282     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
283     idx += 5;
284   }
285   /* backward solve the L^T */
286   for (i=n-1; i>=0; i--){
287     v    = aa + 25*diag[i] - 25;
288     vi   = aj + diag[i] - 1;
289     nz   = diag[i] - ai[i];
290     idt  = 5*i;
291     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
292     while (nz--) {
293       idx   = 5*(*vi--);
294       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
295       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
296       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
297       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
298       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
299       v -= 25;
300     }
301   }
302   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
303   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
304   PetscLogFlops(2*25*(a->nz) - 5*A->n);
305   PetscFunctionReturn(0);
306 }
307 
308 #undef __FUNCT__
309 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
310 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
311 {
312   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
313   PetscErrorCode ierr;
314   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
315   int             *diag = a->diag,oidx;
316   MatScalar       *aa=a->a,*v;
317   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
318   PetscScalar     *x,*b;
319 
320   PetscFunctionBegin;
321   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
322   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
323   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
324 
325   /* forward solve the U^T */
326   idx = 0;
327   for (i=0; i<n; i++) {
328 
329     v     = aa + 36*diag[i];
330     /* multiply by the inverse of the block diagonal */
331     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
332     x6    = x[5+idx];
333     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
334     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
335     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
336     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
337     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
338     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
339     v += 36;
340 
341     vi    = aj + diag[i] + 1;
342     nz    = ai[i+1] - diag[i] - 1;
343     while (nz--) {
344       oidx = 6*(*vi++);
345       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
346       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
347       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
348       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
349       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
350       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
351       v  += 36;
352     }
353     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
354     x[5+idx] = s6;
355     idx += 6;
356   }
357   /* backward solve the L^T */
358   for (i=n-1; i>=0; i--){
359     v    = aa + 36*diag[i] - 36;
360     vi   = aj + diag[i] - 1;
361     nz   = diag[i] - ai[i];
362     idt  = 6*i;
363     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
364     s6 = x[5+idt];
365     while (nz--) {
366       idx   = 6*(*vi--);
367       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
368       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
369       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
370       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
371       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
372       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
373       v -= 36;
374     }
375   }
376   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
377   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
378   PetscLogFlops(2*36*(a->nz) - 6*A->n);
379   PetscFunctionReturn(0);
380 }
381 
382 #undef __FUNCT__
383 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
384 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
385 {
386   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
387   PetscErrorCode ierr;
388   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
389   int             *diag = a->diag,oidx;
390   MatScalar       *aa=a->a,*v;
391   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
392   PetscScalar     *x,*b;
393 
394   PetscFunctionBegin;
395   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
396   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
397   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
398 
399   /* forward solve the U^T */
400   idx = 0;
401   for (i=0; i<n; i++) {
402 
403     v     = aa + 49*diag[i];
404     /* multiply by the inverse of the block diagonal */
405     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
406     x6    = x[5+idx]; x7 = x[6+idx];
407     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
408     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
409     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
410     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
411     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
412     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
413     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
414     v += 49;
415 
416     vi    = aj + diag[i] + 1;
417     nz    = ai[i+1] - diag[i] - 1;
418     while (nz--) {
419       oidx = 7*(*vi++);
420       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
421       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
422       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
423       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
424       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
425       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
426       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
427       v  += 49;
428     }
429     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
430     x[5+idx] = s6;x[6+idx] = s7;
431     idx += 7;
432   }
433   /* backward solve the L^T */
434   for (i=n-1; i>=0; i--){
435     v    = aa + 49*diag[i] - 49;
436     vi   = aj + diag[i] - 1;
437     nz   = diag[i] - ai[i];
438     idt  = 7*i;
439     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
440     s6 = x[5+idt];s7 = x[6+idt];
441     while (nz--) {
442       idx   = 7*(*vi--);
443       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
444       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
445       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
446       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
447       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
448       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
449       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
450       v -= 49;
451     }
452   }
453   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
454   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
455   PetscLogFlops(2*49*(a->nz) - 7*A->n);
456   PetscFunctionReturn(0);
457 }
458 
459 /*---------------------------------------------------------------------------------------------*/
460 #undef __FUNCT__
461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
462 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
463 {
464   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
465   IS              iscol=a->col,isrow=a->row;
466   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
467   int             *diag = a->diag;
468   MatScalar       *aa=a->a,*v;
469   PetscScalar     s1,*x,*b,*t;
470 
471   PetscFunctionBegin;
472   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474   t  = a->solve_work;
475 
476   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
477   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
478 
479   /* copy the b into temp work space according to permutation */
480   for (i=0; i<n; i++) {
481     t[i] = b[c[i]];
482   }
483 
484   /* forward solve the U^T */
485   for (i=0; i<n; i++) {
486 
487     v     = aa + diag[i];
488     /* multiply by the inverse of the block diagonal */
489     s1    = (*v++)*t[i];
490     vi    = aj + diag[i] + 1;
491     nz    = ai[i+1] - diag[i] - 1;
492     while (nz--) {
493       t[*vi++]  -= (*v++)*s1;
494     }
495     t[i]   = s1;
496   }
497   /* backward solve the L^T */
498   for (i=n-1; i>=0; i--){
499     v    = aa + diag[i] - 1;
500     vi   = aj + diag[i] - 1;
501     nz   = diag[i] - ai[i];
502     s1   = t[i];
503     while (nz--) {
504       t[*vi--]   -=  (*v--)*s1;
505     }
506   }
507 
508   /* copy t into x according to permutation */
509   for (i=0; i<n; i++) {
510     x[r[i]]   = t[i];
511   }
512 
513   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
514   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
515   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
516   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
517   PetscLogFlops(2*(a->nz) - A->n);
518   PetscFunctionReturn(0);
519 }
520 
521 #undef __FUNCT__
522 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
523 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
524 {
525   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
526   IS              iscol=a->col,isrow=a->row;
527   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
528   int             *diag = a->diag,ii,ic,ir,oidx;
529   MatScalar       *aa=a->a,*v;
530   PetscScalar     s1,s2,x1,x2;
531   PetscScalar     *x,*b,*t;
532 
533   PetscFunctionBegin;
534   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
535   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
536   t  = a->solve_work;
537 
538   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
539   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
540 
541   /* copy the b into temp work space according to permutation */
542   ii = 0;
543   for (i=0; i<n; i++) {
544     ic      = 2*c[i];
545     t[ii]   = b[ic];
546     t[ii+1] = b[ic+1];
547     ii += 2;
548   }
549 
550   /* forward solve the U^T */
551   idx = 0;
552   for (i=0; i<n; i++) {
553 
554     v     = aa + 4*diag[i];
555     /* multiply by the inverse of the block diagonal */
556     x1    = t[idx];   x2 = t[1+idx];
557     s1 = v[0]*x1  +  v[1]*x2;
558     s2 = v[2]*x1  +  v[3]*x2;
559     v += 4;
560 
561     vi    = aj + diag[i] + 1;
562     nz    = ai[i+1] - diag[i] - 1;
563     while (nz--) {
564       oidx = 2*(*vi++);
565       t[oidx]   -= v[0]*s1  +  v[1]*s2;
566       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
567       v  += 4;
568     }
569     t[idx]   = s1;t[1+idx] = s2;
570     idx += 2;
571   }
572   /* backward solve the L^T */
573   for (i=n-1; i>=0; i--){
574     v    = aa + 4*diag[i] - 4;
575     vi   = aj + diag[i] - 1;
576     nz   = diag[i] - ai[i];
577     idt  = 2*i;
578     s1 = t[idt];  s2 = t[1+idt];
579     while (nz--) {
580       idx   = 2*(*vi--);
581       t[idx]   -=  v[0]*s1 +  v[1]*s2;
582       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
583       v -= 4;
584     }
585   }
586 
587   /* copy t into x according to permutation */
588   ii = 0;
589   for (i=0; i<n; i++) {
590     ir      = 2*r[i];
591     x[ir]   = t[ii];
592     x[ir+1] = t[ii+1];
593     ii += 2;
594   }
595 
596   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
597   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
598   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
599   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
600   PetscLogFlops(2*4*(a->nz) - 2*A->n);
601   PetscFunctionReturn(0);
602 }
603 
604 #undef __FUNCT__
605 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
606 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
607 {
608   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
609   IS              iscol=a->col,isrow=a->row;
610   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
611   int             *diag = a->diag,ii,ic,ir,oidx;
612   MatScalar       *aa=a->a,*v;
613   PetscScalar     s1,s2,s3,x1,x2,x3;
614   PetscScalar     *x,*b,*t;
615 
616   PetscFunctionBegin;
617   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
618   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
619   t  = a->solve_work;
620 
621   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
622   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
623 
624   /* copy the b into temp work space according to permutation */
625   ii = 0;
626   for (i=0; i<n; i++) {
627     ic      = 3*c[i];
628     t[ii]   = b[ic];
629     t[ii+1] = b[ic+1];
630     t[ii+2] = b[ic+2];
631     ii += 3;
632   }
633 
634   /* forward solve the U^T */
635   idx = 0;
636   for (i=0; i<n; i++) {
637 
638     v     = aa + 9*diag[i];
639     /* multiply by the inverse of the block diagonal */
640     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
641     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
642     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
643     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
644     v += 9;
645 
646     vi    = aj + diag[i] + 1;
647     nz    = ai[i+1] - diag[i] - 1;
648     while (nz--) {
649       oidx = 3*(*vi++);
650       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
651       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
652       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
653       v  += 9;
654     }
655     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
656     idx += 3;
657   }
658   /* backward solve the L^T */
659   for (i=n-1; i>=0; i--){
660     v    = aa + 9*diag[i] - 9;
661     vi   = aj + diag[i] - 1;
662     nz   = diag[i] - ai[i];
663     idt  = 3*i;
664     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
665     while (nz--) {
666       idx   = 3*(*vi--);
667       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
668       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
669       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
670       v -= 9;
671     }
672   }
673 
674   /* copy t into x according to permutation */
675   ii = 0;
676   for (i=0; i<n; i++) {
677     ir      = 3*r[i];
678     x[ir]   = t[ii];
679     x[ir+1] = t[ii+1];
680     x[ir+2] = t[ii+2];
681     ii += 3;
682   }
683 
684   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
685   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
686   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
688   PetscLogFlops(2*9*(a->nz) - 3*A->n);
689   PetscFunctionReturn(0);
690 }
691 
692 #undef __FUNCT__
693 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
694 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
695 {
696   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
697   IS              iscol=a->col,isrow=a->row;
698   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
699   int             *diag = a->diag,ii,ic,ir,oidx;
700   MatScalar       *aa=a->a,*v;
701   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
702   PetscScalar     *x,*b,*t;
703 
704   PetscFunctionBegin;
705   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
706   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
707   t  = a->solve_work;
708 
709   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
710   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
711 
712   /* copy the b into temp work space according to permutation */
713   ii = 0;
714   for (i=0; i<n; i++) {
715     ic      = 4*c[i];
716     t[ii]   = b[ic];
717     t[ii+1] = b[ic+1];
718     t[ii+2] = b[ic+2];
719     t[ii+3] = b[ic+3];
720     ii += 4;
721   }
722 
723   /* forward solve the U^T */
724   idx = 0;
725   for (i=0; i<n; i++) {
726 
727     v     = aa + 16*diag[i];
728     /* multiply by the inverse of the block diagonal */
729     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
730     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
731     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
732     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
733     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
734     v += 16;
735 
736     vi    = aj + diag[i] + 1;
737     nz    = ai[i+1] - diag[i] - 1;
738     while (nz--) {
739       oidx = 4*(*vi++);
740       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
741       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
742       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
743       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
744       v  += 16;
745     }
746     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
747     idx += 4;
748   }
749   /* backward solve the L^T */
750   for (i=n-1; i>=0; i--){
751     v    = aa + 16*diag[i] - 16;
752     vi   = aj + diag[i] - 1;
753     nz   = diag[i] - ai[i];
754     idt  = 4*i;
755     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
756     while (nz--) {
757       idx   = 4*(*vi--);
758       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
759       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
760       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
761       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
762       v -= 16;
763     }
764   }
765 
766   /* copy t into x according to permutation */
767   ii = 0;
768   for (i=0; i<n; i++) {
769     ir      = 4*r[i];
770     x[ir]   = t[ii];
771     x[ir+1] = t[ii+1];
772     x[ir+2] = t[ii+2];
773     x[ir+3] = t[ii+3];
774     ii += 4;
775   }
776 
777   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
778   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
779   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
780   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
781   PetscLogFlops(2*16*(a->nz) - 4*A->n);
782   PetscFunctionReturn(0);
783 }
784 
785 #undef __FUNCT__
786 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
787 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
788 {
789   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
790   IS              iscol=a->col,isrow=a->row;
791   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
792   int             *diag = a->diag,ii,ic,ir,oidx;
793   MatScalar       *aa=a->a,*v;
794   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
795   PetscScalar     *x,*b,*t;
796 
797   PetscFunctionBegin;
798   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
799   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
800   t  = a->solve_work;
801 
802   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
803   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
804 
805   /* copy the b into temp work space according to permutation */
806   ii = 0;
807   for (i=0; i<n; i++) {
808     ic      = 5*c[i];
809     t[ii]   = b[ic];
810     t[ii+1] = b[ic+1];
811     t[ii+2] = b[ic+2];
812     t[ii+3] = b[ic+3];
813     t[ii+4] = b[ic+4];
814     ii += 5;
815   }
816 
817   /* forward solve the U^T */
818   idx = 0;
819   for (i=0; i<n; i++) {
820 
821     v     = aa + 25*diag[i];
822     /* multiply by the inverse of the block diagonal */
823     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
824     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
825     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
826     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
827     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
828     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
829     v += 25;
830 
831     vi    = aj + diag[i] + 1;
832     nz    = ai[i+1] - diag[i] - 1;
833     while (nz--) {
834       oidx = 5*(*vi++);
835       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
836       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
837       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
838       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
839       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
840       v  += 25;
841     }
842     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
843     idx += 5;
844   }
845   /* backward solve the L^T */
846   for (i=n-1; i>=0; i--){
847     v    = aa + 25*diag[i] - 25;
848     vi   = aj + diag[i] - 1;
849     nz   = diag[i] - ai[i];
850     idt  = 5*i;
851     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
852     while (nz--) {
853       idx   = 5*(*vi--);
854       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
855       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
856       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
857       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
858       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
859       v -= 25;
860     }
861   }
862 
863   /* copy t into x according to permutation */
864   ii = 0;
865   for (i=0; i<n; i++) {
866     ir      = 5*r[i];
867     x[ir]   = t[ii];
868     x[ir+1] = t[ii+1];
869     x[ir+2] = t[ii+2];
870     x[ir+3] = t[ii+3];
871     x[ir+4] = t[ii+4];
872     ii += 5;
873   }
874 
875   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
876   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
877   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
878   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
879   PetscLogFlops(2*25*(a->nz) - 5*A->n);
880   PetscFunctionReturn(0);
881 }
882 
883 #undef __FUNCT__
884 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
885 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
886 {
887   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
888   IS              iscol=a->col,isrow=a->row;
889   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
890   int             *diag = a->diag,ii,ic,ir,oidx;
891   MatScalar       *aa=a->a,*v;
892   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
893   PetscScalar     *x,*b,*t;
894 
895   PetscFunctionBegin;
896   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
897   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
898   t  = a->solve_work;
899 
900   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
901   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
902 
903   /* copy the b into temp work space according to permutation */
904   ii = 0;
905   for (i=0; i<n; i++) {
906     ic      = 6*c[i];
907     t[ii]   = b[ic];
908     t[ii+1] = b[ic+1];
909     t[ii+2] = b[ic+2];
910     t[ii+3] = b[ic+3];
911     t[ii+4] = b[ic+4];
912     t[ii+5] = b[ic+5];
913     ii += 6;
914   }
915 
916   /* forward solve the U^T */
917   idx = 0;
918   for (i=0; i<n; i++) {
919 
920     v     = aa + 36*diag[i];
921     /* multiply by the inverse of the block diagonal */
922     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
923     x6    = t[5+idx];
924     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
925     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
926     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
927     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
928     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
929     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
930     v += 36;
931 
932     vi    = aj + diag[i] + 1;
933     nz    = ai[i+1] - diag[i] - 1;
934     while (nz--) {
935       oidx = 6*(*vi++);
936       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
937       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
938       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
939       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
940       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
941       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
942       v  += 36;
943     }
944     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
945     t[5+idx] = s6;
946     idx += 6;
947   }
948   /* backward solve the L^T */
949   for (i=n-1; i>=0; i--){
950     v    = aa + 36*diag[i] - 36;
951     vi   = aj + diag[i] - 1;
952     nz   = diag[i] - ai[i];
953     idt  = 6*i;
954     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
955     s6 = t[5+idt];
956     while (nz--) {
957       idx   = 6*(*vi--);
958       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
959       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
960       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
961       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
962       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
963       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
964       v -= 36;
965     }
966   }
967 
968   /* copy t into x according to permutation */
969   ii = 0;
970   for (i=0; i<n; i++) {
971     ir      = 6*r[i];
972     x[ir]   = t[ii];
973     x[ir+1] = t[ii+1];
974     x[ir+2] = t[ii+2];
975     x[ir+3] = t[ii+3];
976     x[ir+4] = t[ii+4];
977     x[ir+5] = t[ii+5];
978     ii += 6;
979   }
980 
981   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
982   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
983   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
984   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
985   PetscLogFlops(2*36*(a->nz) - 6*A->n);
986   PetscFunctionReturn(0);
987 }
988 
989 #undef __FUNCT__
990 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
991 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
992 {
993   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
994   IS              iscol=a->col,isrow=a->row;
995   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
996   int             *diag = a->diag,ii,ic,ir,oidx;
997   MatScalar       *aa=a->a,*v;
998   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
999   PetscScalar     *x,*b,*t;
1000 
1001   PetscFunctionBegin;
1002   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1003   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1004   t  = a->solve_work;
1005 
1006   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1007   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1008 
1009   /* copy the b into temp work space according to permutation */
1010   ii = 0;
1011   for (i=0; i<n; i++) {
1012     ic      = 7*c[i];
1013     t[ii]   = b[ic];
1014     t[ii+1] = b[ic+1];
1015     t[ii+2] = b[ic+2];
1016     t[ii+3] = b[ic+3];
1017     t[ii+4] = b[ic+4];
1018     t[ii+5] = b[ic+5];
1019     t[ii+6] = b[ic+6];
1020     ii += 7;
1021   }
1022 
1023   /* forward solve the U^T */
1024   idx = 0;
1025   for (i=0; i<n; i++) {
1026 
1027     v     = aa + 49*diag[i];
1028     /* multiply by the inverse of the block diagonal */
1029     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1030     x6    = t[5+idx]; x7 = t[6+idx];
1031     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1032     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1033     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1034     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1035     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1036     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1037     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1038     v += 49;
1039 
1040     vi    = aj + diag[i] + 1;
1041     nz    = ai[i+1] - diag[i] - 1;
1042     while (nz--) {
1043       oidx = 7*(*vi++);
1044       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1045       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1046       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1047       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1048       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1049       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1050       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1051       v  += 49;
1052     }
1053     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1054     t[5+idx] = s6;t[6+idx] = s7;
1055     idx += 7;
1056   }
1057   /* backward solve the L^T */
1058   for (i=n-1; i>=0; i--){
1059     v    = aa + 49*diag[i] - 49;
1060     vi   = aj + diag[i] - 1;
1061     nz   = diag[i] - ai[i];
1062     idt  = 7*i;
1063     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1064     s6 = t[5+idt];s7 = t[6+idt];
1065     while (nz--) {
1066       idx   = 7*(*vi--);
1067       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1068       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1069       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1070       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1071       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1072       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1073       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1074       v -= 49;
1075     }
1076   }
1077 
1078   /* copy t into x according to permutation */
1079   ii = 0;
1080   for (i=0; i<n; i++) {
1081     ir      = 7*r[i];
1082     x[ir]   = t[ii];
1083     x[ir+1] = t[ii+1];
1084     x[ir+2] = t[ii+2];
1085     x[ir+3] = t[ii+3];
1086     x[ir+4] = t[ii+4];
1087     x[ir+5] = t[ii+5];
1088     x[ir+6] = t[ii+6];
1089     ii += 7;
1090   }
1091 
1092   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1093   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1094   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1095   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1096   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 /* ----------------------------------------------------------- */
1101 #undef __FUNCT__
1102 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1103 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1104 {
1105   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1106   IS              iscol=a->col,isrow=a->row;
1107   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1108   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1109   MatScalar       *aa=a->a,*v;
1110   PetscScalar     *x,*b,*s,*t,*ls;
1111 
1112   PetscFunctionBegin;
1113   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1114   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1115   t  = a->solve_work;
1116 
1117   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1118   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1119 
1120   /* forward solve the lower triangular */
1121   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1122   for (i=1; i<n; i++) {
1123     v   = aa + bs2*ai[i];
1124     vi  = aj + ai[i];
1125     nz  = a->diag[i] - ai[i];
1126     s = t + bs*i;
1127     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1128     while (nz--) {
1129       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1130       v += bs2;
1131     }
1132   }
1133   /* backward solve the upper triangular */
1134   ls = a->solve_work + A->n;
1135   for (i=n-1; i>=0; i--){
1136     v   = aa + bs2*(a->diag[i] + 1);
1137     vi  = aj + a->diag[i] + 1;
1138     nz  = ai[i+1] - a->diag[i] - 1;
1139     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1140     while (nz--) {
1141       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1142       v += bs2;
1143     }
1144     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1145     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1146   }
1147 
1148   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1149   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1150   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1151   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1152   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1153   PetscFunctionReturn(0);
1154 }
1155 
1156 #undef __FUNCT__
1157 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1158 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1159 {
1160   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1161   IS              iscol=a->col,isrow=a->row;
1162   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1163   int             *diag = a->diag;
1164   MatScalar       *aa=a->a,*v;
1165   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1166   PetscScalar     *x,*b,*t;
1167 
1168   PetscFunctionBegin;
1169   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1170   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1171   t  = a->solve_work;
1172 
1173   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1174   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1175 
1176   /* forward solve the lower triangular */
1177   idx    = 7*(*r++);
1178   t[0] = b[idx];   t[1] = b[1+idx];
1179   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1180   t[5] = b[5+idx]; t[6] = b[6+idx];
1181 
1182   for (i=1; i<n; i++) {
1183     v     = aa + 49*ai[i];
1184     vi    = aj + ai[i];
1185     nz    = diag[i] - ai[i];
1186     idx   = 7*(*r++);
1187     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1188     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1189     while (nz--) {
1190       idx   = 7*(*vi++);
1191       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1192       x4    = t[3+idx];x5 = t[4+idx];
1193       x6    = t[5+idx];x7 = t[6+idx];
1194       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1195       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1196       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1197       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1198       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1199       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1200       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1201       v += 49;
1202     }
1203     idx = 7*i;
1204     t[idx]   = s1;t[1+idx] = s2;
1205     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1206     t[5+idx] = s6;t[6+idx] = s7;
1207   }
1208   /* backward solve the upper triangular */
1209   for (i=n-1; i>=0; i--){
1210     v    = aa + 49*diag[i] + 49;
1211     vi   = aj + diag[i] + 1;
1212     nz   = ai[i+1] - diag[i] - 1;
1213     idt  = 7*i;
1214     s1 = t[idt];  s2 = t[1+idt];
1215     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1216     s6 = t[5+idt];s7 = t[6+idt];
1217     while (nz--) {
1218       idx   = 7*(*vi++);
1219       x1    = t[idx];   x2 = t[1+idx];
1220       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1221       x6    = t[5+idx]; x7 = t[6+idx];
1222       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1223       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1224       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1225       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1226       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1227       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1228       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1229       v += 49;
1230     }
1231     idc = 7*(*c--);
1232     v   = aa + 49*diag[i];
1233     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1234                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1235     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1236                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1237     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1238                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1239     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1240                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1241     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1242                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1243     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1244                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1245     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1246                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1247   }
1248 
1249   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1250   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1251   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1252   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1253   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 #undef __FUNCT__
1258 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1259 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1260 {
1261   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1262   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1263   PetscErrorCode ierr;
1264   int *diag = a->diag,jdx;
1265   MatScalar       *aa=a->a,*v;
1266   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1267 
1268   PetscFunctionBegin;
1269   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1270   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1271   /* forward solve the lower triangular */
1272   idx    = 0;
1273   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1274   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1275   x[6] = b[6+idx];
1276   for (i=1; i<n; i++) {
1277     v     =  aa + 49*ai[i];
1278     vi    =  aj + ai[i];
1279     nz    =  diag[i] - ai[i];
1280     idx   =  7*i;
1281     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1282     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1283     s7  =  b[6+idx];
1284     while (nz--) {
1285       jdx   = 7*(*vi++);
1286       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1287       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1288       x7    = x[6+jdx];
1289       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1290       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1291       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1292       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1293       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1294       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1295       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1296       v += 49;
1297      }
1298     x[idx]   = s1;
1299     x[1+idx] = s2;
1300     x[2+idx] = s3;
1301     x[3+idx] = s4;
1302     x[4+idx] = s5;
1303     x[5+idx] = s6;
1304     x[6+idx] = s7;
1305   }
1306   /* backward solve the upper triangular */
1307   for (i=n-1; i>=0; i--){
1308     v    = aa + 49*diag[i] + 49;
1309     vi   = aj + diag[i] + 1;
1310     nz   = ai[i+1] - diag[i] - 1;
1311     idt  = 7*i;
1312     s1 = x[idt];   s2 = x[1+idt];
1313     s3 = x[2+idt]; s4 = x[3+idt];
1314     s5 = x[4+idt]; s6 = x[5+idt];
1315     s7 = x[6+idt];
1316     while (nz--) {
1317       idx   = 7*(*vi++);
1318       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1319       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1320       x7    = x[6+idx];
1321       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1322       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1323       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1324       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1325       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1326       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1327       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1328       v += 49;
1329     }
1330     v        = aa + 49*diag[i];
1331     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1332                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1333     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1334                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1335     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1336                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1337     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1338                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1339     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1340                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1341     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1342                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1343     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1344                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1345   }
1346 
1347   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1348   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1349   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1350   PetscFunctionReturn(0);
1351 }
1352 
1353 #undef __FUNCT__
1354 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1355 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1356 {
1357   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1358   IS              iscol=a->col,isrow=a->row;
1359   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1360   int             *diag = a->diag;
1361   MatScalar       *aa=a->a,*v;
1362   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1363 
1364   PetscFunctionBegin;
1365   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1366   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1367   t  = a->solve_work;
1368 
1369   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1370   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1371 
1372   /* forward solve the lower triangular */
1373   idx    = 6*(*r++);
1374   t[0] = b[idx];   t[1] = b[1+idx];
1375   t[2] = b[2+idx]; t[3] = b[3+idx];
1376   t[4] = b[4+idx]; t[5] = b[5+idx];
1377   for (i=1; i<n; i++) {
1378     v     = aa + 36*ai[i];
1379     vi    = aj + ai[i];
1380     nz    = diag[i] - ai[i];
1381     idx   = 6*(*r++);
1382     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1383     s5  = b[4+idx]; s6 = b[5+idx];
1384     while (nz--) {
1385       idx   = 6*(*vi++);
1386       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1387       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1388       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1389       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1390       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1391       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1392       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1393       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1394       v += 36;
1395     }
1396     idx = 6*i;
1397     t[idx]   = s1;t[1+idx] = s2;
1398     t[2+idx] = s3;t[3+idx] = s4;
1399     t[4+idx] = s5;t[5+idx] = s6;
1400   }
1401   /* backward solve the upper triangular */
1402   for (i=n-1; i>=0; i--){
1403     v    = aa + 36*diag[i] + 36;
1404     vi   = aj + diag[i] + 1;
1405     nz   = ai[i+1] - diag[i] - 1;
1406     idt  = 6*i;
1407     s1 = t[idt];  s2 = t[1+idt];
1408     s3 = t[2+idt];s4 = t[3+idt];
1409     s5 = t[4+idt];s6 = t[5+idt];
1410     while (nz--) {
1411       idx   = 6*(*vi++);
1412       x1    = t[idx];   x2 = t[1+idx];
1413       x3    = t[2+idx]; x4 = t[3+idx];
1414       x5    = t[4+idx]; x6 = t[5+idx];
1415       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1416       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1417       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1418       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1419       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1420       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1421       v += 36;
1422     }
1423     idc = 6*(*c--);
1424     v   = aa + 36*diag[i];
1425     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1426                                  v[18]*s4+v[24]*s5+v[30]*s6;
1427     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1428                                  v[19]*s4+v[25]*s5+v[31]*s6;
1429     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1430                                  v[20]*s4+v[26]*s5+v[32]*s6;
1431     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1432                                  v[21]*s4+v[27]*s5+v[33]*s6;
1433     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1434                                  v[22]*s4+v[28]*s5+v[34]*s6;
1435     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1436                                  v[23]*s4+v[29]*s5+v[35]*s6;
1437   }
1438 
1439   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1440   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1443   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 #undef __FUNCT__
1448 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1449 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1450 {
1451   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1452   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1453   PetscErrorCode ierr;
1454   int *diag = a->diag,jdx;
1455   MatScalar       *aa=a->a,*v;
1456   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1457 
1458   PetscFunctionBegin;
1459   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1460   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1461   /* forward solve the lower triangular */
1462   idx    = 0;
1463   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1464   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1465   for (i=1; i<n; i++) {
1466     v     =  aa + 36*ai[i];
1467     vi    =  aj + ai[i];
1468     nz    =  diag[i] - ai[i];
1469     idx   =  6*i;
1470     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1471     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1472     while (nz--) {
1473       jdx   = 6*(*vi++);
1474       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1475       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1476       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1477       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1478       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1479       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1480       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1481       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1482       v += 36;
1483      }
1484     x[idx]   = s1;
1485     x[1+idx] = s2;
1486     x[2+idx] = s3;
1487     x[3+idx] = s4;
1488     x[4+idx] = s5;
1489     x[5+idx] = s6;
1490   }
1491   /* backward solve the upper triangular */
1492   for (i=n-1; i>=0; i--){
1493     v    = aa + 36*diag[i] + 36;
1494     vi   = aj + diag[i] + 1;
1495     nz   = ai[i+1] - diag[i] - 1;
1496     idt  = 6*i;
1497     s1 = x[idt];   s2 = x[1+idt];
1498     s3 = x[2+idt]; s4 = x[3+idt];
1499     s5 = x[4+idt]; s6 = x[5+idt];
1500     while (nz--) {
1501       idx   = 6*(*vi++);
1502       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1503       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1504       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1505       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1506       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1507       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1508       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1509       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1510       v += 36;
1511     }
1512     v        = aa + 36*diag[i];
1513     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1514     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1515     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1516     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1517     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1518     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1519   }
1520 
1521   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1523   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1524   PetscFunctionReturn(0);
1525 }
1526 
1527 #undef __FUNCT__
1528 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1529 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1530 {
1531   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1532   IS              iscol=a->col,isrow=a->row;
1533   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1534   int             *diag = a->diag;
1535   MatScalar       *aa=a->a,*v;
1536   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1537 
1538   PetscFunctionBegin;
1539   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1540   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1541   t  = a->solve_work;
1542 
1543   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1544   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1545 
1546   /* forward solve the lower triangular */
1547   idx    = 5*(*r++);
1548   t[0] = b[idx];   t[1] = b[1+idx];
1549   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1550   for (i=1; i<n; i++) {
1551     v     = aa + 25*ai[i];
1552     vi    = aj + ai[i];
1553     nz    = diag[i] - ai[i];
1554     idx   = 5*(*r++);
1555     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1556     s5  = b[4+idx];
1557     while (nz--) {
1558       idx   = 5*(*vi++);
1559       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1560       x4    = t[3+idx];x5 = t[4+idx];
1561       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1562       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1563       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1564       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1565       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1566       v += 25;
1567     }
1568     idx = 5*i;
1569     t[idx]   = s1;t[1+idx] = s2;
1570     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1571   }
1572   /* backward solve the upper triangular */
1573   for (i=n-1; i>=0; i--){
1574     v    = aa + 25*diag[i] + 25;
1575     vi   = aj + diag[i] + 1;
1576     nz   = ai[i+1] - diag[i] - 1;
1577     idt  = 5*i;
1578     s1 = t[idt];  s2 = t[1+idt];
1579     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1580     while (nz--) {
1581       idx   = 5*(*vi++);
1582       x1    = t[idx];   x2 = t[1+idx];
1583       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1584       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1585       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1586       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1587       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1588       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1589       v += 25;
1590     }
1591     idc = 5*(*c--);
1592     v   = aa + 25*diag[i];
1593     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1594                                  v[15]*s4+v[20]*s5;
1595     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1596                                  v[16]*s4+v[21]*s5;
1597     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1598                                  v[17]*s4+v[22]*s5;
1599     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1600                                  v[18]*s4+v[23]*s5;
1601     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1602                                  v[19]*s4+v[24]*s5;
1603   }
1604 
1605   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1606   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1607   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1608   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1609   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1610   PetscFunctionReturn(0);
1611 }
1612 
1613 #undef __FUNCT__
1614 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1615 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1616 {
1617   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1618   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1619   PetscErrorCode ierr;
1620   int *diag = a->diag,jdx;
1621   MatScalar       *aa=a->a,*v;
1622   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1623 
1624   PetscFunctionBegin;
1625   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1626   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1627   /* forward solve the lower triangular */
1628   idx    = 0;
1629   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1630   for (i=1; i<n; i++) {
1631     v     =  aa + 25*ai[i];
1632     vi    =  aj + ai[i];
1633     nz    =  diag[i] - ai[i];
1634     idx   =  5*i;
1635     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1636     while (nz--) {
1637       jdx   = 5*(*vi++);
1638       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1639       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1640       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1641       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1642       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1643       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1644       v    += 25;
1645     }
1646     x[idx]   = s1;
1647     x[1+idx] = s2;
1648     x[2+idx] = s3;
1649     x[3+idx] = s4;
1650     x[4+idx] = s5;
1651   }
1652   /* backward solve the upper triangular */
1653   for (i=n-1; i>=0; i--){
1654     v    = aa + 25*diag[i] + 25;
1655     vi   = aj + diag[i] + 1;
1656     nz   = ai[i+1] - diag[i] - 1;
1657     idt  = 5*i;
1658     s1 = x[idt];  s2 = x[1+idt];
1659     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1660     while (nz--) {
1661       idx   = 5*(*vi++);
1662       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1663       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1664       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1665       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1666       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1667       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1668       v    += 25;
1669     }
1670     v        = aa + 25*diag[i];
1671     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1672     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1673     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1674     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1675     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1676   }
1677 
1678   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1679   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1680   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1681   PetscFunctionReturn(0);
1682 }
1683 
1684 #undef __FUNCT__
1685 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1686 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1687 {
1688   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1689   IS              iscol=a->col,isrow=a->row;
1690   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1691   int             *diag = a->diag;
1692   MatScalar       *aa=a->a,*v;
1693   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1694 
1695   PetscFunctionBegin;
1696   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1697   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1698   t  = a->solve_work;
1699 
1700   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1701   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1702 
1703   /* forward solve the lower triangular */
1704   idx    = 4*(*r++);
1705   t[0] = b[idx];   t[1] = b[1+idx];
1706   t[2] = b[2+idx]; t[3] = b[3+idx];
1707   for (i=1; i<n; i++) {
1708     v     = aa + 16*ai[i];
1709     vi    = aj + ai[i];
1710     nz    = diag[i] - ai[i];
1711     idx   = 4*(*r++);
1712     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1713     while (nz--) {
1714       idx   = 4*(*vi++);
1715       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1716       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1717       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1718       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1719       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1720       v    += 16;
1721     }
1722     idx        = 4*i;
1723     t[idx]   = s1;t[1+idx] = s2;
1724     t[2+idx] = s3;t[3+idx] = s4;
1725   }
1726   /* backward solve the upper triangular */
1727   for (i=n-1; i>=0; i--){
1728     v    = aa + 16*diag[i] + 16;
1729     vi   = aj + diag[i] + 1;
1730     nz   = ai[i+1] - diag[i] - 1;
1731     idt  = 4*i;
1732     s1 = t[idt];  s2 = t[1+idt];
1733     s3 = t[2+idt];s4 = t[3+idt];
1734     while (nz--) {
1735       idx   = 4*(*vi++);
1736       x1    = t[idx];   x2 = t[1+idx];
1737       x3    = t[2+idx]; x4 = t[3+idx];
1738       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1739       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1740       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1741       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1742       v += 16;
1743     }
1744     idc      = 4*(*c--);
1745     v        = aa + 16*diag[i];
1746     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1747     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1748     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1749     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1750   }
1751 
1752   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1753   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1754   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1755   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1756   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1757   PetscFunctionReturn(0);
1758 }
1759 
1760 #undef __FUNCT__
1761 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1762 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1763 {
1764   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1765   IS              iscol=a->col,isrow=a->row;
1766   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1767   int             *diag = a->diag;
1768   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1769   PetscScalar     *x,*b;
1770 
1771   PetscFunctionBegin;
1772   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1773   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1774   t  = (MatScalar *)a->solve_work;
1775 
1776   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1777   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1778 
1779   /* forward solve the lower triangular */
1780   idx    = 4*(*r++);
1781   t[0] = (MatScalar)b[idx];
1782   t[1] = (MatScalar)b[1+idx];
1783   t[2] = (MatScalar)b[2+idx];
1784   t[3] = (MatScalar)b[3+idx];
1785   for (i=1; i<n; i++) {
1786     v     = aa + 16*ai[i];
1787     vi    = aj + ai[i];
1788     nz    = diag[i] - ai[i];
1789     idx   = 4*(*r++);
1790     s1 = (MatScalar)b[idx];
1791     s2 = (MatScalar)b[1+idx];
1792     s3 = (MatScalar)b[2+idx];
1793     s4 = (MatScalar)b[3+idx];
1794     while (nz--) {
1795       idx   = 4*(*vi++);
1796       x1  = t[idx];
1797       x2  = t[1+idx];
1798       x3  = t[2+idx];
1799       x4  = t[3+idx];
1800       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1801       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1802       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1803       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1804       v    += 16;
1805     }
1806     idx        = 4*i;
1807     t[idx]   = s1;
1808     t[1+idx] = s2;
1809     t[2+idx] = s3;
1810     t[3+idx] = s4;
1811   }
1812   /* backward solve the upper triangular */
1813   for (i=n-1; i>=0; i--){
1814     v    = aa + 16*diag[i] + 16;
1815     vi   = aj + diag[i] + 1;
1816     nz   = ai[i+1] - diag[i] - 1;
1817     idt  = 4*i;
1818     s1 = t[idt];
1819     s2 = t[1+idt];
1820     s3 = t[2+idt];
1821     s4 = t[3+idt];
1822     while (nz--) {
1823       idx   = 4*(*vi++);
1824       x1  = t[idx];
1825       x2  = t[1+idx];
1826       x3  = t[2+idx];
1827       x4  = t[3+idx];
1828       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1829       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1830       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1831       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1832       v += 16;
1833     }
1834     idc      = 4*(*c--);
1835     v        = aa + 16*diag[i];
1836     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1837     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1838     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1839     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1840     x[idc]   = (PetscScalar)t[idt];
1841     x[1+idc] = (PetscScalar)t[1+idt];
1842     x[2+idc] = (PetscScalar)t[2+idt];
1843     x[3+idc] = (PetscScalar)t[3+idt];
1844  }
1845 
1846   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1847   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1848   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1849   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1850   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1851   PetscFunctionReturn(0);
1852 }
1853 
1854 #if defined (PETSC_HAVE_SSE)
1855 
1856 #include PETSC_HAVE_SSE
1857 
1858 #undef __FUNCT__
1859 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1860 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1861 {
1862   /*
1863      Note: This code uses demotion of double
1864      to float when performing the mixed-mode computation.
1865      This may not be numerically reasonable for all applications.
1866   */
1867   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1868   IS              iscol=a->col,isrow=a->row;
1869   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1870   int             *diag = a->diag,ai16;
1871   MatScalar       *aa=a->a,*v;
1872   PetscScalar     *x,*b,*t;
1873 
1874   /* Make space in temp stack for 16 Byte Aligned arrays */
1875   float           ssealignedspace[11],*tmps,*tmpx;
1876   unsigned long   offset;
1877 
1878   PetscFunctionBegin;
1879   SSE_SCOPE_BEGIN;
1880 
1881     offset = (unsigned long)ssealignedspace % 16;
1882     if (offset) offset = (16 - offset)/4;
1883     tmps = &ssealignedspace[offset];
1884     tmpx = &ssealignedspace[offset+4];
1885     PREFETCH_NTA(aa+16*ai[1]);
1886 
1887     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1888     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1889     t  = a->solve_work;
1890 
1891     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1892     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1893 
1894     /* forward solve the lower triangular */
1895     idx  = 4*(*r++);
1896     t[0] = b[idx];   t[1] = b[1+idx];
1897     t[2] = b[2+idx]; t[3] = b[3+idx];
1898     v    =  aa + 16*ai[1];
1899 
1900     for (i=1; i<n;) {
1901       PREFETCH_NTA(&v[8]);
1902       vi   =  aj      + ai[i];
1903       nz   =  diag[i] - ai[i];
1904       idx  =  4*(*r++);
1905 
1906       /* Demote sum from double to float */
1907       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1908       LOAD_PS(tmps,XMM7);
1909 
1910       while (nz--) {
1911         PREFETCH_NTA(&v[16]);
1912         idx = 4*(*vi++);
1913 
1914         /* Demote solution (so far) from double to float */
1915         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1916 
1917         /* 4x4 Matrix-Vector product with negative accumulation: */
1918         SSE_INLINE_BEGIN_2(tmpx,v)
1919           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1920 
1921           /* First Column */
1922           SSE_COPY_PS(XMM0,XMM6)
1923           SSE_SHUFFLE(XMM0,XMM0,0x00)
1924           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1925           SSE_SUB_PS(XMM7,XMM0)
1926 
1927           /* Second Column */
1928           SSE_COPY_PS(XMM1,XMM6)
1929           SSE_SHUFFLE(XMM1,XMM1,0x55)
1930           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1931           SSE_SUB_PS(XMM7,XMM1)
1932 
1933           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1934 
1935           /* Third Column */
1936           SSE_COPY_PS(XMM2,XMM6)
1937           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1938           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1939           SSE_SUB_PS(XMM7,XMM2)
1940 
1941           /* Fourth Column */
1942           SSE_COPY_PS(XMM3,XMM6)
1943           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1944           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1945           SSE_SUB_PS(XMM7,XMM3)
1946         SSE_INLINE_END_2
1947 
1948         v  += 16;
1949       }
1950       idx = 4*i;
1951       v   = aa + 16*ai[++i];
1952       PREFETCH_NTA(v);
1953       STORE_PS(tmps,XMM7);
1954 
1955       /* Promote result from float to double */
1956       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1957     }
1958     /* backward solve the upper triangular */
1959     idt  = 4*(n-1);
1960     ai16 = 16*diag[n-1];
1961     v    = aa + ai16 + 16;
1962     for (i=n-1; i>=0;){
1963       PREFETCH_NTA(&v[8]);
1964       vi = aj + diag[i] + 1;
1965       nz = ai[i+1] - diag[i] - 1;
1966 
1967       /* Demote accumulator from double to float */
1968       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1969       LOAD_PS(tmps,XMM7);
1970 
1971       while (nz--) {
1972         PREFETCH_NTA(&v[16]);
1973         idx = 4*(*vi++);
1974 
1975         /* Demote solution (so far) from double to float */
1976         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1977 
1978         /* 4x4 Matrix-Vector Product with negative accumulation: */
1979         SSE_INLINE_BEGIN_2(tmpx,v)
1980           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1981 
1982           /* First Column */
1983           SSE_COPY_PS(XMM0,XMM6)
1984           SSE_SHUFFLE(XMM0,XMM0,0x00)
1985           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1986           SSE_SUB_PS(XMM7,XMM0)
1987 
1988           /* Second Column */
1989           SSE_COPY_PS(XMM1,XMM6)
1990           SSE_SHUFFLE(XMM1,XMM1,0x55)
1991           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1992           SSE_SUB_PS(XMM7,XMM1)
1993 
1994           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1995 
1996           /* Third Column */
1997           SSE_COPY_PS(XMM2,XMM6)
1998           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1999           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2000           SSE_SUB_PS(XMM7,XMM2)
2001 
2002           /* Fourth Column */
2003           SSE_COPY_PS(XMM3,XMM6)
2004           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2005           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2006           SSE_SUB_PS(XMM7,XMM3)
2007         SSE_INLINE_END_2
2008         v  += 16;
2009       }
2010       v    = aa + ai16;
2011       ai16 = 16*diag[--i];
2012       PREFETCH_NTA(aa+ai16+16);
2013       /*
2014          Scale the result by the diagonal 4x4 block,
2015          which was inverted as part of the factorization
2016       */
2017       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2018         /* First Column */
2019         SSE_COPY_PS(XMM0,XMM7)
2020         SSE_SHUFFLE(XMM0,XMM0,0x00)
2021         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2022 
2023         /* Second Column */
2024         SSE_COPY_PS(XMM1,XMM7)
2025         SSE_SHUFFLE(XMM1,XMM1,0x55)
2026         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2027         SSE_ADD_PS(XMM0,XMM1)
2028 
2029         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2030 
2031         /* Third Column */
2032         SSE_COPY_PS(XMM2,XMM7)
2033         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2034         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2035         SSE_ADD_PS(XMM0,XMM2)
2036 
2037         /* Fourth Column */
2038         SSE_COPY_PS(XMM3,XMM7)
2039         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2040         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2041         SSE_ADD_PS(XMM0,XMM3)
2042 
2043         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2044       SSE_INLINE_END_3
2045 
2046       /* Promote solution from float to double */
2047       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2048 
2049       /* Apply reordering to t and stream into x.    */
2050       /* This way, x doesn't pollute the cache.      */
2051       /* Be careful with size: 2 doubles = 4 floats! */
2052       idc  = 4*(*c--);
2053       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2054         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2055         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2056         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2057         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2058         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2059         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2060       SSE_INLINE_END_2
2061       v    = aa + ai16 + 16;
2062       idt -= 4;
2063     }
2064 
2065     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2066     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2067     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2068     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2069     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2070   SSE_SCOPE_END;
2071   PetscFunctionReturn(0);
2072 }
2073 
2074 #endif
2075 
2076 
2077 /*
2078       Special case where the matrix was ILU(0) factored in the natural
2079    ordering. This eliminates the need for the column and row permutation.
2080 */
2081 #undef __FUNCT__
2082 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2083 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2084 {
2085   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2086   int             n=a->mbs,*ai=a->i,*aj=a->j;
2087   PetscErrorCode ierr;
2088   int  *diag = a->diag;
2089   MatScalar       *aa=a->a;
2090   PetscScalar     *x,*b;
2091 
2092   PetscFunctionBegin;
2093   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2094   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2095 
2096 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2097   {
2098     static PetscScalar w[2000]; /* very BAD need to fix */
2099     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2100   }
2101 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2102   {
2103     static PetscScalar w[2000]; /* very BAD need to fix */
2104     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2105   }
2106 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2107   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2108 #else
2109   {
2110     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2111     MatScalar    *v;
2112     int          jdx,idt,idx,nz,*vi,i,ai16;
2113 
2114   /* forward solve the lower triangular */
2115   idx    = 0;
2116   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2117   for (i=1; i<n; i++) {
2118     v     =  aa      + 16*ai[i];
2119     vi    =  aj      + ai[i];
2120     nz    =  diag[i] - ai[i];
2121     idx   +=  4;
2122     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2123     while (nz--) {
2124       jdx   = 4*(*vi++);
2125       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2126       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2127       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2128       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2129       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2130       v    += 16;
2131     }
2132     x[idx]   = s1;
2133     x[1+idx] = s2;
2134     x[2+idx] = s3;
2135     x[3+idx] = s4;
2136   }
2137   /* backward solve the upper triangular */
2138   idt = 4*(n-1);
2139   for (i=n-1; i>=0; i--){
2140     ai16 = 16*diag[i];
2141     v    = aa + ai16 + 16;
2142     vi   = aj + diag[i] + 1;
2143     nz   = ai[i+1] - diag[i] - 1;
2144     s1 = x[idt];  s2 = x[1+idt];
2145     s3 = x[2+idt];s4 = x[3+idt];
2146     while (nz--) {
2147       idx   = 4*(*vi++);
2148       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2149       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2150       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2151       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2152       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2153       v    += 16;
2154     }
2155     v        = aa + ai16;
2156     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2157     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2158     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2159     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2160     idt -= 4;
2161   }
2162   }
2163 #endif
2164 
2165   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2167   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2168   PetscFunctionReturn(0);
2169 }
2170 
2171 #undef __FUNCT__
2172 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2173 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2174 {
2175   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2176   int             n=a->mbs,*ai=a->i,*aj=a->j;
2177   PetscErrorCode ierr;
2178   int  *diag = a->diag;
2179   MatScalar       *aa=a->a;
2180   PetscScalar     *x,*b;
2181 
2182   PetscFunctionBegin;
2183   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2184   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2185 
2186   {
2187     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2188     MatScalar  *v,*t=(MatScalar *)x;
2189     int        jdx,idt,idx,nz,*vi,i,ai16;
2190 
2191     /* forward solve the lower triangular */
2192     idx  = 0;
2193     t[0] = (MatScalar)b[0];
2194     t[1] = (MatScalar)b[1];
2195     t[2] = (MatScalar)b[2];
2196     t[3] = (MatScalar)b[3];
2197     for (i=1; i<n; i++) {
2198       v     =  aa      + 16*ai[i];
2199       vi    =  aj      + ai[i];
2200       nz    =  diag[i] - ai[i];
2201       idx   +=  4;
2202       s1 = (MatScalar)b[idx];
2203       s2 = (MatScalar)b[1+idx];
2204       s3 = (MatScalar)b[2+idx];
2205       s4 = (MatScalar)b[3+idx];
2206       while (nz--) {
2207         jdx = 4*(*vi++);
2208         x1  = t[jdx];
2209         x2  = t[1+jdx];
2210         x3  = t[2+jdx];
2211         x4  = t[3+jdx];
2212         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2213         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2214         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2215         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2216         v    += 16;
2217       }
2218       t[idx]   = s1;
2219       t[1+idx] = s2;
2220       t[2+idx] = s3;
2221       t[3+idx] = s4;
2222     }
2223     /* backward solve the upper triangular */
2224     idt = 4*(n-1);
2225     for (i=n-1; i>=0; i--){
2226       ai16 = 16*diag[i];
2227       v    = aa + ai16 + 16;
2228       vi   = aj + diag[i] + 1;
2229       nz   = ai[i+1] - diag[i] - 1;
2230       s1   = t[idt];
2231       s2   = t[1+idt];
2232       s3   = t[2+idt];
2233       s4   = t[3+idt];
2234       while (nz--) {
2235         idx = 4*(*vi++);
2236         x1  = (MatScalar)x[idx];
2237         x2  = (MatScalar)x[1+idx];
2238         x3  = (MatScalar)x[2+idx];
2239         x4  = (MatScalar)x[3+idx];
2240         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2241         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2242         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2243         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2244         v    += 16;
2245       }
2246       v        = aa + ai16;
2247       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2248       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2249       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2250       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2251       idt -= 4;
2252     }
2253   }
2254 
2255   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2256   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2257   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2258   PetscFunctionReturn(0);
2259 }
2260 
2261 #if defined (PETSC_HAVE_SSE)
2262 
2263 #include PETSC_HAVE_SSE
2264 #undef __FUNCT__
2265 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2266 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2267 {
2268   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2269   unsigned short *aj=(unsigned short *)a->j;
2270   PetscErrorCode ierr;
2271   int  *ai=a->i,n=a->mbs,*diag = a->diag;
2272   MatScalar      *aa=a->a;
2273   PetscScalar    *x,*b;
2274 
2275   PetscFunctionBegin;
2276   SSE_SCOPE_BEGIN;
2277   /*
2278      Note: This code currently uses demotion of double
2279      to float when performing the mixed-mode computation.
2280      This may not be numerically reasonable for all applications.
2281   */
2282   PREFETCH_NTA(aa+16*ai[1]);
2283 
2284   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2285   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2286   {
2287     /* x will first be computed in single precision then promoted inplace to double */
2288     MatScalar      *v,*t=(MatScalar *)x;
2289     int            nz,i,idt,ai16;
2290     unsigned int   jdx,idx;
2291     unsigned short *vi;
2292     /* Forward solve the lower triangular factor. */
2293 
2294     /* First block is the identity. */
2295     idx  = 0;
2296     CONVERT_DOUBLE4_FLOAT4(t,b);
2297     v    =  aa + 16*((unsigned int)ai[1]);
2298 
2299     for (i=1; i<n;) {
2300       PREFETCH_NTA(&v[8]);
2301       vi   =  aj      + ai[i];
2302       nz   =  diag[i] - ai[i];
2303       idx +=  4;
2304 
2305       /* Demote RHS from double to float. */
2306       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2307       LOAD_PS(&t[idx],XMM7);
2308 
2309       while (nz--) {
2310         PREFETCH_NTA(&v[16]);
2311         jdx = 4*((unsigned int)(*vi++));
2312 
2313         /* 4x4 Matrix-Vector product with negative accumulation: */
2314         SSE_INLINE_BEGIN_2(&t[jdx],v)
2315           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2316 
2317           /* First Column */
2318           SSE_COPY_PS(XMM0,XMM6)
2319           SSE_SHUFFLE(XMM0,XMM0,0x00)
2320           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2321           SSE_SUB_PS(XMM7,XMM0)
2322 
2323           /* Second Column */
2324           SSE_COPY_PS(XMM1,XMM6)
2325           SSE_SHUFFLE(XMM1,XMM1,0x55)
2326           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2327           SSE_SUB_PS(XMM7,XMM1)
2328 
2329           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2330 
2331           /* Third Column */
2332           SSE_COPY_PS(XMM2,XMM6)
2333           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2334           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2335           SSE_SUB_PS(XMM7,XMM2)
2336 
2337           /* Fourth Column */
2338           SSE_COPY_PS(XMM3,XMM6)
2339           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2340           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2341           SSE_SUB_PS(XMM7,XMM3)
2342         SSE_INLINE_END_2
2343 
2344         v  += 16;
2345       }
2346       v    =  aa + 16*ai[++i];
2347       PREFETCH_NTA(v);
2348       STORE_PS(&t[idx],XMM7);
2349     }
2350 
2351     /* Backward solve the upper triangular factor.*/
2352 
2353     idt  = 4*(n-1);
2354     ai16 = 16*diag[n-1];
2355     v    = aa + ai16 + 16;
2356     for (i=n-1; i>=0;){
2357       PREFETCH_NTA(&v[8]);
2358       vi = aj + diag[i] + 1;
2359       nz = ai[i+1] - diag[i] - 1;
2360 
2361       LOAD_PS(&t[idt],XMM7);
2362 
2363       while (nz--) {
2364         PREFETCH_NTA(&v[16]);
2365         idx = 4*((unsigned int)(*vi++));
2366 
2367         /* 4x4 Matrix-Vector Product with negative accumulation: */
2368         SSE_INLINE_BEGIN_2(&t[idx],v)
2369           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2370 
2371           /* First Column */
2372           SSE_COPY_PS(XMM0,XMM6)
2373           SSE_SHUFFLE(XMM0,XMM0,0x00)
2374           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2375           SSE_SUB_PS(XMM7,XMM0)
2376 
2377           /* Second Column */
2378           SSE_COPY_PS(XMM1,XMM6)
2379           SSE_SHUFFLE(XMM1,XMM1,0x55)
2380           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2381           SSE_SUB_PS(XMM7,XMM1)
2382 
2383           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2384 
2385           /* Third Column */
2386           SSE_COPY_PS(XMM2,XMM6)
2387           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2388           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2389           SSE_SUB_PS(XMM7,XMM2)
2390 
2391           /* Fourth Column */
2392           SSE_COPY_PS(XMM3,XMM6)
2393           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2394           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2395           SSE_SUB_PS(XMM7,XMM3)
2396         SSE_INLINE_END_2
2397         v  += 16;
2398       }
2399       v    = aa + ai16;
2400       ai16 = 16*diag[--i];
2401       PREFETCH_NTA(aa+ai16+16);
2402       /*
2403          Scale the result by the diagonal 4x4 block,
2404          which was inverted as part of the factorization
2405       */
2406       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2407         /* First Column */
2408         SSE_COPY_PS(XMM0,XMM7)
2409         SSE_SHUFFLE(XMM0,XMM0,0x00)
2410         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2411 
2412         /* Second Column */
2413         SSE_COPY_PS(XMM1,XMM7)
2414         SSE_SHUFFLE(XMM1,XMM1,0x55)
2415         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2416         SSE_ADD_PS(XMM0,XMM1)
2417 
2418         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2419 
2420         /* Third Column */
2421         SSE_COPY_PS(XMM2,XMM7)
2422         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2423         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2424         SSE_ADD_PS(XMM0,XMM2)
2425 
2426         /* Fourth Column */
2427         SSE_COPY_PS(XMM3,XMM7)
2428         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2429         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2430         SSE_ADD_PS(XMM0,XMM3)
2431 
2432         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2433       SSE_INLINE_END_3
2434 
2435       v    = aa + ai16 + 16;
2436       idt -= 4;
2437     }
2438 
2439     /* Convert t from single precision back to double precision (inplace)*/
2440     idt = 4*(n-1);
2441     for (i=n-1;i>=0;i--) {
2442       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2443       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2444       PetscScalar *xtemp=&x[idt];
2445       MatScalar   *ttemp=&t[idt];
2446       xtemp[3] = (PetscScalar)ttemp[3];
2447       xtemp[2] = (PetscScalar)ttemp[2];
2448       xtemp[1] = (PetscScalar)ttemp[1];
2449       xtemp[0] = (PetscScalar)ttemp[0];
2450       idt -= 4;
2451     }
2452 
2453   } /* End of artificial scope. */
2454   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2456   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2457   SSE_SCOPE_END;
2458   PetscFunctionReturn(0);
2459 }
2460 
2461 #undef __FUNCT__
2462 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2463 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2464 {
2465   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2466   int            *aj=a->j;
2467   PetscErrorCode ierr;
2468   int *ai=a->i,n=a->mbs,*diag = a->diag;
2469   MatScalar      *aa=a->a;
2470   PetscScalar    *x,*b;
2471 
2472   PetscFunctionBegin;
2473   SSE_SCOPE_BEGIN;
2474   /*
2475      Note: This code currently uses demotion of double
2476      to float when performing the mixed-mode computation.
2477      This may not be numerically reasonable for all applications.
2478   */
2479   PREFETCH_NTA(aa+16*ai[1]);
2480 
2481   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2482   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2483   {
2484     /* x will first be computed in single precision then promoted inplace to double */
2485     MatScalar *v,*t=(MatScalar *)x;
2486     int       nz,i,idt,ai16;
2487     int       jdx,idx;
2488     int       *vi;
2489     /* Forward solve the lower triangular factor. */
2490 
2491     /* First block is the identity. */
2492     idx  = 0;
2493     CONVERT_DOUBLE4_FLOAT4(t,b);
2494     v    =  aa + 16*ai[1];
2495 
2496     for (i=1; i<n;) {
2497       PREFETCH_NTA(&v[8]);
2498       vi   =  aj      + ai[i];
2499       nz   =  diag[i] - ai[i];
2500       idx +=  4;
2501 
2502       /* Demote RHS from double to float. */
2503       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2504       LOAD_PS(&t[idx],XMM7);
2505 
2506       while (nz--) {
2507         PREFETCH_NTA(&v[16]);
2508         jdx = 4*(*vi++);
2509 /*          jdx = *vi++; */
2510 
2511         /* 4x4 Matrix-Vector product with negative accumulation: */
2512         SSE_INLINE_BEGIN_2(&t[jdx],v)
2513           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2514 
2515           /* First Column */
2516           SSE_COPY_PS(XMM0,XMM6)
2517           SSE_SHUFFLE(XMM0,XMM0,0x00)
2518           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2519           SSE_SUB_PS(XMM7,XMM0)
2520 
2521           /* Second Column */
2522           SSE_COPY_PS(XMM1,XMM6)
2523           SSE_SHUFFLE(XMM1,XMM1,0x55)
2524           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2525           SSE_SUB_PS(XMM7,XMM1)
2526 
2527           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2528 
2529           /* Third Column */
2530           SSE_COPY_PS(XMM2,XMM6)
2531           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2532           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2533           SSE_SUB_PS(XMM7,XMM2)
2534 
2535           /* Fourth Column */
2536           SSE_COPY_PS(XMM3,XMM6)
2537           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2538           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2539           SSE_SUB_PS(XMM7,XMM3)
2540         SSE_INLINE_END_2
2541 
2542         v  += 16;
2543       }
2544       v    =  aa + 16*ai[++i];
2545       PREFETCH_NTA(v);
2546       STORE_PS(&t[idx],XMM7);
2547     }
2548 
2549     /* Backward solve the upper triangular factor.*/
2550 
2551     idt  = 4*(n-1);
2552     ai16 = 16*diag[n-1];
2553     v    = aa + ai16 + 16;
2554     for (i=n-1; i>=0;){
2555       PREFETCH_NTA(&v[8]);
2556       vi = aj + diag[i] + 1;
2557       nz = ai[i+1] - diag[i] - 1;
2558 
2559       LOAD_PS(&t[idt],XMM7);
2560 
2561       while (nz--) {
2562         PREFETCH_NTA(&v[16]);
2563         idx = 4*(*vi++);
2564 /*          idx = *vi++; */
2565 
2566         /* 4x4 Matrix-Vector Product with negative accumulation: */
2567         SSE_INLINE_BEGIN_2(&t[idx],v)
2568           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2569 
2570           /* First Column */
2571           SSE_COPY_PS(XMM0,XMM6)
2572           SSE_SHUFFLE(XMM0,XMM0,0x00)
2573           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2574           SSE_SUB_PS(XMM7,XMM0)
2575 
2576           /* Second Column */
2577           SSE_COPY_PS(XMM1,XMM6)
2578           SSE_SHUFFLE(XMM1,XMM1,0x55)
2579           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2580           SSE_SUB_PS(XMM7,XMM1)
2581 
2582           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2583 
2584           /* Third Column */
2585           SSE_COPY_PS(XMM2,XMM6)
2586           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2587           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2588           SSE_SUB_PS(XMM7,XMM2)
2589 
2590           /* Fourth Column */
2591           SSE_COPY_PS(XMM3,XMM6)
2592           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2593           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2594           SSE_SUB_PS(XMM7,XMM3)
2595         SSE_INLINE_END_2
2596         v  += 16;
2597       }
2598       v    = aa + ai16;
2599       ai16 = 16*diag[--i];
2600       PREFETCH_NTA(aa+ai16+16);
2601       /*
2602          Scale the result by the diagonal 4x4 block,
2603          which was inverted as part of the factorization
2604       */
2605       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2606         /* First Column */
2607         SSE_COPY_PS(XMM0,XMM7)
2608         SSE_SHUFFLE(XMM0,XMM0,0x00)
2609         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2610 
2611         /* Second Column */
2612         SSE_COPY_PS(XMM1,XMM7)
2613         SSE_SHUFFLE(XMM1,XMM1,0x55)
2614         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2615         SSE_ADD_PS(XMM0,XMM1)
2616 
2617         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2618 
2619         /* Third Column */
2620         SSE_COPY_PS(XMM2,XMM7)
2621         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2622         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2623         SSE_ADD_PS(XMM0,XMM2)
2624 
2625         /* Fourth Column */
2626         SSE_COPY_PS(XMM3,XMM7)
2627         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2628         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2629         SSE_ADD_PS(XMM0,XMM3)
2630 
2631         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2632       SSE_INLINE_END_3
2633 
2634       v    = aa + ai16 + 16;
2635       idt -= 4;
2636     }
2637 
2638     /* Convert t from single precision back to double precision (inplace)*/
2639     idt = 4*(n-1);
2640     for (i=n-1;i>=0;i--) {
2641       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2642       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2643       PetscScalar *xtemp=&x[idt];
2644       MatScalar   *ttemp=&t[idt];
2645       xtemp[3] = (PetscScalar)ttemp[3];
2646       xtemp[2] = (PetscScalar)ttemp[2];
2647       xtemp[1] = (PetscScalar)ttemp[1];
2648       xtemp[0] = (PetscScalar)ttemp[0];
2649       idt -= 4;
2650     }
2651 
2652   } /* End of artificial scope. */
2653   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2654   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2655   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2656   SSE_SCOPE_END;
2657   PetscFunctionReturn(0);
2658 }
2659 
2660 #endif
2661 #undef __FUNCT__
2662 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2663 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2664 {
2665   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2666   IS              iscol=a->col,isrow=a->row;
2667   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2668   int             *diag = a->diag;
2669   MatScalar       *aa=a->a,*v;
2670   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
2671 
2672   PetscFunctionBegin;
2673   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2674   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2675   t  = a->solve_work;
2676 
2677   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2678   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2679 
2680   /* forward solve the lower triangular */
2681   idx    = 3*(*r++);
2682   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2683   for (i=1; i<n; i++) {
2684     v     = aa + 9*ai[i];
2685     vi    = aj + ai[i];
2686     nz    = diag[i] - ai[i];
2687     idx   = 3*(*r++);
2688     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2689     while (nz--) {
2690       idx   = 3*(*vi++);
2691       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2692       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2693       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2694       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2695       v += 9;
2696     }
2697     idx = 3*i;
2698     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2699   }
2700   /* backward solve the upper triangular */
2701   for (i=n-1; i>=0; i--){
2702     v    = aa + 9*diag[i] + 9;
2703     vi   = aj + diag[i] + 1;
2704     nz   = ai[i+1] - diag[i] - 1;
2705     idt  = 3*i;
2706     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2707     while (nz--) {
2708       idx   = 3*(*vi++);
2709       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2710       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2711       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2712       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2713       v += 9;
2714     }
2715     idc = 3*(*c--);
2716     v   = aa + 9*diag[i];
2717     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2718     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2719     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2720   }
2721   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2722   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2723   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2724   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2725   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2726   PetscFunctionReturn(0);
2727 }
2728 
2729 /*
2730       Special case where the matrix was ILU(0) factored in the natural
2731    ordering. This eliminates the need for the column and row permutation.
2732 */
2733 #undef __FUNCT__
2734 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2735 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2736 {
2737   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2738   int             n=a->mbs,*ai=a->i,*aj=a->j;
2739   PetscErrorCode ierr;
2740   int *diag = a->diag;
2741   MatScalar       *aa=a->a,*v;
2742   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2743   int             jdx,idt,idx,nz,*vi,i;
2744 
2745   PetscFunctionBegin;
2746   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2747   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2748 
2749 
2750   /* forward solve the lower triangular */
2751   idx    = 0;
2752   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2753   for (i=1; i<n; i++) {
2754     v     =  aa      + 9*ai[i];
2755     vi    =  aj      + ai[i];
2756     nz    =  diag[i] - ai[i];
2757     idx   +=  3;
2758     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2759     while (nz--) {
2760       jdx   = 3*(*vi++);
2761       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2762       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2763       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2764       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2765       v    += 9;
2766     }
2767     x[idx]   = s1;
2768     x[1+idx] = s2;
2769     x[2+idx] = s3;
2770   }
2771   /* backward solve the upper triangular */
2772   for (i=n-1; i>=0; i--){
2773     v    = aa + 9*diag[i] + 9;
2774     vi   = aj + diag[i] + 1;
2775     nz   = ai[i+1] - diag[i] - 1;
2776     idt  = 3*i;
2777     s1 = x[idt];  s2 = x[1+idt];
2778     s3 = x[2+idt];
2779     while (nz--) {
2780       idx   = 3*(*vi++);
2781       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2782       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2783       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2784       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2785       v    += 9;
2786     }
2787     v        = aa +  9*diag[i];
2788     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2789     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2790     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2791   }
2792 
2793   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2794   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2795   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2796   PetscFunctionReturn(0);
2797 }
2798 
2799 #undef __FUNCT__
2800 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2801 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2802 {
2803   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2804   IS              iscol=a->col,isrow=a->row;
2805   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2806   int             *diag = a->diag;
2807   MatScalar       *aa=a->a,*v;
2808   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
2809 
2810   PetscFunctionBegin;
2811   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2812   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2813   t  = a->solve_work;
2814 
2815   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2816   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2817 
2818   /* forward solve the lower triangular */
2819   idx    = 2*(*r++);
2820   t[0] = b[idx]; t[1] = b[1+idx];
2821   for (i=1; i<n; i++) {
2822     v     = aa + 4*ai[i];
2823     vi    = aj + ai[i];
2824     nz    = diag[i] - ai[i];
2825     idx   = 2*(*r++);
2826     s1  = b[idx]; s2 = b[1+idx];
2827     while (nz--) {
2828       idx   = 2*(*vi++);
2829       x1    = t[idx]; x2 = t[1+idx];
2830       s1 -= v[0]*x1 + v[2]*x2;
2831       s2 -= v[1]*x1 + v[3]*x2;
2832       v += 4;
2833     }
2834     idx = 2*i;
2835     t[idx] = s1; t[1+idx] = s2;
2836   }
2837   /* backward solve the upper triangular */
2838   for (i=n-1; i>=0; i--){
2839     v    = aa + 4*diag[i] + 4;
2840     vi   = aj + diag[i] + 1;
2841     nz   = ai[i+1] - diag[i] - 1;
2842     idt  = 2*i;
2843     s1 = t[idt]; s2 = t[1+idt];
2844     while (nz--) {
2845       idx   = 2*(*vi++);
2846       x1    = t[idx]; x2 = t[1+idx];
2847       s1 -= v[0]*x1 + v[2]*x2;
2848       s2 -= v[1]*x1 + v[3]*x2;
2849       v += 4;
2850     }
2851     idc = 2*(*c--);
2852     v   = aa + 4*diag[i];
2853     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2854     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2855   }
2856   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2857   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2858   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2859   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2860   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2861   PetscFunctionReturn(0);
2862 }
2863 
2864 /*
2865       Special case where the matrix was ILU(0) factored in the natural
2866    ordering. This eliminates the need for the column and row permutation.
2867 */
2868 #undef __FUNCT__
2869 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2870 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2871 {
2872   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2873   int             n=a->mbs,*ai=a->i,*aj=a->j;
2874   PetscErrorCode ierr;
2875   int  *diag = a->diag;
2876   MatScalar       *aa=a->a,*v;
2877   PetscScalar     *x,*b,s1,s2,x1,x2;
2878   int             jdx,idt,idx,nz,*vi,i;
2879 
2880   PetscFunctionBegin;
2881   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2882   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2883 
2884   /* forward solve the lower triangular */
2885   idx    = 0;
2886   x[0]   = b[0]; x[1] = b[1];
2887   for (i=1; i<n; i++) {
2888     v     =  aa      + 4*ai[i];
2889     vi    =  aj      + ai[i];
2890     nz    =  diag[i] - ai[i];
2891     idx   +=  2;
2892     s1  =  b[idx];s2 = b[1+idx];
2893     while (nz--) {
2894       jdx   = 2*(*vi++);
2895       x1    = x[jdx];x2 = x[1+jdx];
2896       s1 -= v[0]*x1 + v[2]*x2;
2897       s2 -= v[1]*x1 + v[3]*x2;
2898       v    += 4;
2899     }
2900     x[idx]   = s1;
2901     x[1+idx] = s2;
2902   }
2903   /* backward solve the upper triangular */
2904   for (i=n-1; i>=0; i--){
2905     v    = aa + 4*diag[i] + 4;
2906     vi   = aj + diag[i] + 1;
2907     nz   = ai[i+1] - diag[i] - 1;
2908     idt  = 2*i;
2909     s1 = x[idt];  s2 = x[1+idt];
2910     while (nz--) {
2911       idx   = 2*(*vi++);
2912       x1    = x[idx];   x2 = x[1+idx];
2913       s1 -= v[0]*x1 + v[2]*x2;
2914       s2 -= v[1]*x1 + v[3]*x2;
2915       v    += 4;
2916     }
2917     v        = aa +  4*diag[i];
2918     x[idt]   = v[0]*s1 + v[2]*s2;
2919     x[1+idt] = v[1]*s1 + v[3]*s2;
2920   }
2921 
2922   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2923   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2924   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2925   PetscFunctionReturn(0);
2926 }
2927 
2928 #undef __FUNCT__
2929 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2930 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2931 {
2932   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2933   IS              iscol=a->col,isrow=a->row;
2934   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2935   int             *diag = a->diag;
2936   MatScalar       *aa=a->a,*v;
2937   PetscScalar     *x,*b,s1,*t;
2938 
2939   PetscFunctionBegin;
2940   if (!n) PetscFunctionReturn(0);
2941 
2942   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2943   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2944   t  = a->solve_work;
2945 
2946   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2947   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2948 
2949   /* forward solve the lower triangular */
2950   t[0] = b[*r++];
2951   for (i=1; i<n; i++) {
2952     v     = aa + ai[i];
2953     vi    = aj + ai[i];
2954     nz    = diag[i] - ai[i];
2955     s1  = b[*r++];
2956     while (nz--) {
2957       s1 -= (*v++)*t[*vi++];
2958     }
2959     t[i] = s1;
2960   }
2961   /* backward solve the upper triangular */
2962   for (i=n-1; i>=0; i--){
2963     v    = aa + diag[i] + 1;
2964     vi   = aj + diag[i] + 1;
2965     nz   = ai[i+1] - diag[i] - 1;
2966     s1 = t[i];
2967     while (nz--) {
2968       s1 -= (*v++)*t[*vi++];
2969     }
2970     x[*c--] = t[i] = aa[diag[i]]*s1;
2971   }
2972 
2973   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2974   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2975   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2976   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2977   PetscLogFlops(2*1*(a->nz) - A->n);
2978   PetscFunctionReturn(0);
2979 }
2980 /*
2981       Special case where the matrix was ILU(0) factored in the natural
2982    ordering. This eliminates the need for the column and row permutation.
2983 */
2984 #undef __FUNCT__
2985 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2986 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2987 {
2988   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2989   int             n=a->mbs,*ai=a->i,*aj=a->j;
2990   PetscErrorCode ierr;
2991   int        *diag = a->diag;
2992   MatScalar       *aa=a->a;
2993   PetscScalar     *x,*b;
2994   PetscScalar     s1,x1;
2995   MatScalar       *v;
2996   int             jdx,idt,idx,nz,*vi,i;
2997 
2998   PetscFunctionBegin;
2999   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3000   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3001 
3002   /* forward solve the lower triangular */
3003   idx    = 0;
3004   x[0]   = b[0];
3005   for (i=1; i<n; i++) {
3006     v     =  aa      + ai[i];
3007     vi    =  aj      + ai[i];
3008     nz    =  diag[i] - ai[i];
3009     idx   +=  1;
3010     s1  =  b[idx];
3011     while (nz--) {
3012       jdx   = *vi++;
3013       x1    = x[jdx];
3014       s1 -= v[0]*x1;
3015       v    += 1;
3016     }
3017     x[idx]   = s1;
3018   }
3019   /* backward solve the upper triangular */
3020   for (i=n-1; i>=0; i--){
3021     v    = aa + diag[i] + 1;
3022     vi   = aj + diag[i] + 1;
3023     nz   = ai[i+1] - diag[i] - 1;
3024     idt  = i;
3025     s1 = x[idt];
3026     while (nz--) {
3027       idx   = *vi++;
3028       x1    = x[idx];
3029       s1 -= v[0]*x1;
3030       v    += 1;
3031     }
3032     v        = aa +  diag[i];
3033     x[idt]   = v[0]*s1;
3034   }
3035   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3037   PetscLogFlops(2*(a->nz) - A->n);
3038   PetscFunctionReturn(0);
3039 }
3040 
3041 /* ----------------------------------------------------------------*/
3042 /*
3043      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3044    except that the data structure of Mat_SeqAIJ is slightly different.
3045    Not a good example of code reuse.
3046 */
3047 EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat);
3048 
3049 #undef __FUNCT__
3050 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3051 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3052 {
3053   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3054   IS          isicol;
3055   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3056   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3057   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3058   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3059   PetscTruth  col_identity,row_identity;
3060   PetscReal   f;
3061 
3062   PetscFunctionBegin;
3063   f             = info->fill;
3064   levels        = (int)info->levels;
3065   diagonal_fill = (int)info->diagonal_fill;
3066   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3067   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3068   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3069 
3070   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3071     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3072     (*fact)->factor = FACTOR_LU;
3073     b               = (Mat_SeqBAIJ*)(*fact)->data;
3074     if (!b->diag) {
3075       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3076     }
3077     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3078     b->row        = isrow;
3079     b->col        = iscol;
3080     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3081     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3082     b->icol       = isicol;
3083     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3084     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3085   } else { /* general case perform the symbolic factorization */
3086     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3087     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3088 
3089     /* get new row pointers */
3090     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
3091     ainew[0] = 0;
3092     /* don't know how many column pointers are needed so estimate */
3093     jmax = (int)(f*ai[n] + 1);
3094     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
3095     /* ajfill is level of fill for each fill entry */
3096     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
3097     /* fill is a linked list of nonzeros in active row */
3098     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
3099     /* im is level for each filled value */
3100     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
3101     /* dloc is location of diagonal in factor */
3102     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
3103     dloc[0]  = 0;
3104     for (prow=0; prow<n; prow++) {
3105 
3106       /* copy prow into linked list */
3107       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3108       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3109       xi         = aj + ai[r[prow]];
3110       fill[n]    = n;
3111       fill[prow] = -1; /* marker for diagonal entry */
3112       while (nz--) {
3113 	fm  = n;
3114 	idx = ic[*xi++];
3115 	do {
3116 	  m  = fm;
3117 	  fm = fill[m];
3118 	} while (fm < idx);
3119 	fill[m]   = idx;
3120 	fill[idx] = fm;
3121 	im[idx]   = 0;
3122       }
3123 
3124       /* make sure diagonal entry is included */
3125       if (diagonal_fill && fill[prow] == -1) {
3126 	fm = n;
3127 	while (fill[fm] < prow) fm = fill[fm];
3128 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3129 	fill[fm]   = prow;
3130 	im[prow]   = 0;
3131 	nzf++;
3132 	dcount++;
3133       }
3134 
3135       nzi = 0;
3136       row = fill[n];
3137       while (row < prow) {
3138 	incrlev = im[row] + 1;
3139 	nz      = dloc[row];
3140 	xi      = ajnew  + ainew[row] + nz + 1;
3141 	flev    = ajfill + ainew[row] + nz + 1;
3142 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3143 	fm      = row;
3144 	while (nnz-- > 0) {
3145 	  idx = *xi++;
3146 	  if (*flev + incrlev > levels) {
3147 	    flev++;
3148 	    continue;
3149 	  }
3150 	  do {
3151 	    m  = fm;
3152 	    fm = fill[m];
3153 	  } while (fm < idx);
3154 	  if (fm != idx) {
3155 	    im[idx]   = *flev + incrlev;
3156 	    fill[m]   = idx;
3157 	    fill[idx] = fm;
3158 	    fm        = idx;
3159 	    nzf++;
3160 	  } else {
3161 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3162 	  }
3163 	  flev++;
3164 	}
3165 	row = fill[row];
3166 	nzi++;
3167       }
3168       /* copy new filled row into permanent storage */
3169       ainew[prow+1] = ainew[prow] + nzf;
3170       if (ainew[prow+1] > jmax) {
3171 
3172 	/* estimate how much additional space we will need */
3173 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3174 	/* just double the memory each time */
3175 	int maxadd = jmax;
3176 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3177 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3178 	jmax += maxadd;
3179 
3180 	/* allocate a longer ajnew and ajfill */
3181 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3182 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3183 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3184 	ajnew = xi;
3185 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3186 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3187 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3188 	ajfill = xi;
3189 	reallocate++; /* count how many reallocations are needed */
3190       }
3191       xi          = ajnew + ainew[prow];
3192       flev        = ajfill + ainew[prow];
3193       dloc[prow]  = nzi;
3194       fm          = fill[n];
3195       while (nzf--) {
3196 	*xi++   = fm;
3197 	*flev++ = im[fm];
3198 	fm      = fill[fm];
3199       }
3200       /* make sure row has diagonal entry */
3201       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3202 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3203     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3204       }
3205     }
3206     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3207     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3208     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3209     ierr = PetscFree(fill);CHKERRQ(ierr);
3210     ierr = PetscFree(im);CHKERRQ(ierr);
3211 
3212     {
3213       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3214       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3215       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3216       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3217       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3218       if (diagonal_fill) {
3219 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3220       }
3221     }
3222 
3223     /* put together the new matrix */
3224     ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr);
3225     ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr);
3226     ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr);
3227     PetscLogObjectParent(*fact,isicol);
3228     b = (Mat_SeqBAIJ*)(*fact)->data;
3229     ierr = PetscFree(b->imax);CHKERRQ(ierr);
3230     b->singlemalloc = PETSC_FALSE;
3231     len = bs2*ainew[n]*sizeof(MatScalar);
3232     /* the next line frees the default space generated by the Create() */
3233     ierr = PetscFree(b->a);CHKERRQ(ierr);
3234     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
3235     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
3236     b->j          = ajnew;
3237     b->i          = ainew;
3238     for (i=0; i<n; i++) dloc[i] += ainew[i];
3239     b->diag       = dloc;
3240     b->ilen       = 0;
3241     b->imax       = 0;
3242     b->row        = isrow;
3243     b->col        = iscol;
3244     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3245     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3246     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3247     b->icol       = isicol;
3248     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3249     /* In b structure:  Free imax, ilen, old a, old j.
3250        Allocate dloc, solve_work, new a, new j */
3251     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3252     b->maxnz          = b->nz = ainew[n];
3253     (*fact)->factor   = FACTOR_LU;
3254 
3255     (*fact)->info.factor_mallocs    = reallocate;
3256     (*fact)->info.fill_ratio_given  = f;
3257     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3258   }
3259 
3260   if (row_identity && col_identity) {
3261     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
3262   }
3263   PetscFunctionReturn(0);
3264 }
3265 
3266 #undef __FUNCT__
3267 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3268 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3269 {
3270   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3271   /* int i,*AJ=a->j,nz=a->nz; */
3272   PetscFunctionBegin;
3273   /* Undo Column scaling */
3274 /*    while (nz--) { */
3275 /*      AJ[i] = AJ[i]/4; */
3276 /*    } */
3277   /* This should really invoke a push/pop logic, but we don't have that yet. */
3278   A->ops->setunfactored = PETSC_NULL;
3279   PetscFunctionReturn(0);
3280 }
3281 
3282 #undef __FUNCT__
3283 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3284 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3285 {
3286   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3287   int *AJ=a->j,nz=a->nz;
3288   unsigned short *aj=(unsigned short *)AJ;
3289   PetscFunctionBegin;
3290   /* Is this really necessary? */
3291   while (nz--) {
3292     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3293   }
3294   A->ops->setunfactored = PETSC_NULL;
3295   PetscFunctionReturn(0);
3296 }
3297 
3298 #undef __FUNCT__
3299 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3300 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3301 {
3302   /*
3303       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3304       with natural ordering
3305   */
3306   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3307 
3308   PetscFunctionBegin;
3309   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3310   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3311   switch (a->bs) {
3312   case 1:
3313     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3314     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3315     break;
3316   case 2:
3317     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3318     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3319     break;
3320   case 3:
3321     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3322     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3323     break;
3324   case 4:
3325 #if defined(PETSC_USE_MAT_SINGLE)
3326     {
3327       PetscTruth  sse_enabled_local;
3328       PetscErrorCode ierr;
3329       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3330       if (sse_enabled_local) {
3331 #  if defined(PETSC_HAVE_SSE)
3332         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3333         if (n==(unsigned short)n) {
3334           unsigned short *aj=(unsigned short *)AJ;
3335           for (i=0;i<nz;i++) {
3336             aj[i] = (unsigned short)AJ[i];
3337           }
3338           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3339           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3340           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3341         } else {
3342         /* Scale the column indices for easier indexing in MatSolve. */
3343 /*            for (i=0;i<nz;i++) { */
3344 /*              AJ[i] = AJ[i]*4; */
3345 /*            } */
3346           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3347           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3348           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3349         }
3350 #  else
3351       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3352         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3353 #  endif
3354       } else {
3355         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3356         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3357       }
3358     }
3359 #else
3360     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3361     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3362 #endif
3363     break;
3364   case 5:
3365     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3366     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3367     break;
3368   case 6:
3369     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3370     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3371     break;
3372   case 7:
3373     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3374     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3375     break;
3376   }
3377   PetscFunctionReturn(0);
3378 }
3379 
3380 #undef __FUNCT__
3381 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3382 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3383 {
3384   /*
3385       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3386       with natural ordering
3387   */
3388   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3389   IS          row = a->row, col = a->col;
3390   PetscTruth  row_identity, col_identity;
3391   PetscTruth  use_natural;
3392   PetscErrorCode ierr;
3393 
3394   PetscFunctionBegin;
3395 
3396   use_natural = PETSC_FALSE;
3397   if (row && col) {
3398     ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3399     ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3400 
3401     if (row_identity && col_identity) {
3402       use_natural = PETSC_TRUE;
3403     }
3404   } else {
3405     use_natural = PETSC_TRUE;
3406   }
3407 
3408   switch (a->bs) {
3409   case 1:
3410     if (use_natural) {
3411       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3412       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3413       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3414       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3415     } else {
3416       A->ops->solve           = MatSolve_SeqBAIJ_1;
3417       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3418     }
3419     break;
3420   case 2:
3421     if (use_natural) {
3422       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3423       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3424       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3425       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3426     } else {
3427       A->ops->solve           = MatSolve_SeqBAIJ_2;
3428       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3429     }
3430     break;
3431   case 3:
3432     if (use_natural) {
3433       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3434       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3435       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3436       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3437     } else {
3438       A->ops->solve           = MatSolve_SeqBAIJ_3;
3439       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3440     }
3441     break;
3442   case 4:
3443     {
3444       PetscTruth sse_enabled_local;
3445       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3446       if (use_natural) {
3447 #if defined(PETSC_USE_MAT_SINGLE)
3448         if (sse_enabled_local) { /* Natural + Single + SSE */
3449 #  if defined(PETSC_HAVE_SSE)
3450           int n=a->mbs;
3451           if (n==(unsigned short)n) {
3452             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3453             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3454           } else {
3455             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3456             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3457           }
3458 #  else
3459           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3460           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3461 #  endif
3462         } else { /* Natural + Single */
3463           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3464           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3465         }
3466 #else
3467         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3468         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3469 #endif
3470         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3471         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3472       } else { /* Arbitrary ordering */
3473 #if defined(PETSC_USE_MAT_SINGLE)
3474         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3475 #  if defined(PETSC_HAVE_SSE)
3476           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3477           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3478 #  else
3479           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3480           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3481 #  endif
3482         } else { /* Arbitrary + Single */
3483           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3484           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3485         }
3486 #else
3487         A->ops->solve           = MatSolve_SeqBAIJ_4;
3488 #endif
3489         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3490       }
3491     }
3492     break;
3493   case 5:
3494     if (use_natural) {
3495       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3496       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3497       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3498       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3499     } else {
3500       A->ops->solve           = MatSolve_SeqBAIJ_5;
3501       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3502     }
3503     break;
3504   case 6:
3505     if (use_natural) {
3506       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3507       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3508       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3509       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3510     } else {
3511       A->ops->solve           = MatSolve_SeqBAIJ_6;
3512       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3513     }
3514     break;
3515   case 7:
3516     if (use_natural) {
3517       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3518       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3519       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3520       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3521     } else {
3522       A->ops->solve           = MatSolve_SeqBAIJ_7;
3523       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3524     }
3525     break;
3526   default:
3527     A->ops->solve             = MatSolve_SeqBAIJ_N;
3528     break;
3529   }
3530   PetscFunctionReturn(0);
3531 }
3532 
3533 #undef __FUNCT__
3534 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3535 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3536   PetscErrorCode ierr;
3537 
3538   PetscFunctionBegin;
3539   ierr = MatSeqBAIJ_UpdateSolvers(A);
3540   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3541     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3542   } else {
3543     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3544   }
3545   PetscFunctionReturn(0);
3546 }
3547 
3548 #undef __FUNCT__
3549 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3550 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3551   PetscErrorCode ierr;
3552 
3553   PetscFunctionBegin;
3554   ierr = MatSeqBAIJ_UpdateSolvers(A);
3555   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3556   PetscFunctionReturn(0);
3557 }
3558 
3559 
3560