xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision ea709b57ddb8cc304f11df2e9a39d6a8fdb28b53)
1 /*$Id: baijfact2.c,v 1.69 2001/08/06 21:15:36 bsmith Exp balay $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17   int             *diag = a->diag;
18   MatScalar       *aa=a->a,*v;
19   PetscScalar     s1,*x,*b;
20 
21   PetscFunctionBegin;
22   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24 
25   /* forward solve the U^T */
26   for (i=0; i<n; i++) {
27 
28     v     = aa + diag[i];
29     /* multiply by the inverse of the block diagonal */
30     s1    = (*v++)*b[i];
31     vi    = aj + diag[i] + 1;
32     nz    = ai[i+1] - diag[i] - 1;
33     while (nz--) {
34       x[*vi++]  -= (*v++)*s1;
35     }
36     x[i]   = s1;
37   }
38   /* backward solve the L^T */
39   for (i=n-1; i>=0; i--){
40     v    = aa + diag[i] - 1;
41     vi   = aj + diag[i] - 1;
42     nz   = diag[i] - ai[i];
43     s1   = x[i];
44     while (nz--) {
45       x[*vi--]   -=  (*v--)*s1;
46     }
47   }
48   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50   PetscLogFlops(2*(a->nz) - A->n);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
56 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57 {
58   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60   int             *diag = a->diag,oidx;
61   MatScalar       *aa=a->a,*v;
62   PetscScalar     s1,s2,x1,x2;
63   PetscScalar     *x,*b;
64 
65   PetscFunctionBegin;
66   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
67   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
68 
69   /* forward solve the U^T */
70   idx = 0;
71   for (i=0; i<n; i++) {
72 
73     v     = aa + 4*diag[i];
74     /* multiply by the inverse of the block diagonal */
75     x1 = b[idx];   x2 = b[1+idx];
76     s1 = v[0]*x1  +  v[1]*x2;
77     s2 = v[2]*x1  +  v[3]*x2;
78     v += 4;
79 
80     vi    = aj + diag[i] + 1;
81     nz    = ai[i+1] - diag[i] - 1;
82     while (nz--) {
83       oidx = 2*(*vi++);
84       x[oidx]   -= v[0]*s1  +  v[1]*s2;
85       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
86       v  += 4;
87     }
88     x[idx]   = s1;x[1+idx] = s2;
89     idx += 2;
90   }
91   /* backward solve the L^T */
92   for (i=n-1; i>=0; i--){
93     v    = aa + 4*diag[i] - 4;
94     vi   = aj + diag[i] - 1;
95     nz   = diag[i] - ai[i];
96     idt  = 2*i;
97     s1   = x[idt];  s2 = x[1+idt];
98     while (nz--) {
99       idx   = 2*(*vi--);
100       x[idx]   -=  v[0]*s1 +  v[1]*s2;
101       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
102       v -= 4;
103     }
104   }
105   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
106   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
107   PetscLogFlops(2*4*(a->nz) - 2*A->n);
108   PetscFunctionReturn(0);
109 }
110 
111 #undef __FUNCT__
112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
113 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
114 {
115   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
116   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
117   int             *diag = a->diag,oidx;
118   MatScalar       *aa=a->a,*v;
119   PetscScalar     s1,s2,s3,x1,x2,x3;
120   PetscScalar     *x,*b;
121 
122   PetscFunctionBegin;
123   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
124   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
125 
126   /* forward solve the U^T */
127   idx = 0;
128   for (i=0; i<n; i++) {
129 
130     v     = aa + 9*diag[i];
131     /* multiply by the inverse of the block diagonal */
132     x1 = b[idx];   x2 = b[1+idx]; x3    = b[2+idx];
133     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
134     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
135     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
136     v += 9;
137 
138     vi    = aj + diag[i] + 1;
139     nz    = ai[i+1] - diag[i] - 1;
140     while (nz--) {
141       oidx = 3*(*vi++);
142       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
143       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
144       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
145       v  += 9;
146     }
147     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
148     idx += 3;
149   }
150   /* backward solve the L^T */
151   for (i=n-1; i>=0; i--){
152     v    = aa + 9*diag[i] - 9;
153     vi   = aj + diag[i] - 1;
154     nz   = diag[i] - ai[i];
155     idt  = 3*i;
156     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
157     while (nz--) {
158       idx   = 3*(*vi--);
159       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
160       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
161       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
162       v -= 9;
163     }
164   }
165   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
167   PetscLogFlops(2*9*(a->nz) - 3*A->n);
168   PetscFunctionReturn(0);
169 }
170 
171 #undef __FUNCT__
172 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
173 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
174 {
175   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
176   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
177   int             *diag = a->diag,oidx;
178   MatScalar       *aa=a->a,*v;
179   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
180   PetscScalar     *x,*b;
181 
182   PetscFunctionBegin;
183   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
184   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
185 
186   /* forward solve the U^T */
187   idx = 0;
188   for (i=0; i<n; i++) {
189 
190     v     = aa + 16*diag[i];
191     /* multiply by the inverse of the block diagonal */
192     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx];
193     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
194     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
195     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
196     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
197     v += 16;
198 
199     vi    = aj + diag[i] + 1;
200     nz    = ai[i+1] - diag[i] - 1;
201     while (nz--) {
202       oidx = 4*(*vi++);
203       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
204       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
205       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
206       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
207       v  += 16;
208     }
209     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
210     idx += 4;
211   }
212   /* backward solve the L^T */
213   for (i=n-1; i>=0; i--){
214     v    = aa + 16*diag[i] - 16;
215     vi   = aj + diag[i] - 1;
216     nz   = diag[i] - ai[i];
217     idt  = 4*i;
218     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
219     while (nz--) {
220       idx   = 4*(*vi--);
221       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
222       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
223       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
224       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
225       v -= 16;
226     }
227   }
228   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
229   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
230   PetscLogFlops(2*16*(a->nz) - 4*A->n);
231   PetscFunctionReturn(0);
232 }
233 
234 #undef __FUNCT__
235 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
236 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
237 {
238   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
239   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
240   int             *diag = a->diag,oidx;
241   MatScalar       *aa=a->a,*v;
242   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
243   PetscScalar     *x,*b;
244 
245   PetscFunctionBegin;
246   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
247   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
248 
249   /* forward solve the U^T */
250   idx = 0;
251   for (i=0; i<n; i++) {
252 
253     v     = aa + 25*diag[i];
254     /* multiply by the inverse of the block diagonal */
255     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
256     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
257     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
258     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
259     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
260     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
261     v += 25;
262 
263     vi    = aj + diag[i] + 1;
264     nz    = ai[i+1] - diag[i] - 1;
265     while (nz--) {
266       oidx = 5*(*vi++);
267       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
268       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
269       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
270       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
271       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
272       v  += 25;
273     }
274     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
275     idx += 5;
276   }
277   /* backward solve the L^T */
278   for (i=n-1; i>=0; i--){
279     v    = aa + 25*diag[i] - 25;
280     vi   = aj + diag[i] - 1;
281     nz   = diag[i] - ai[i];
282     idt  = 5*i;
283     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
284     while (nz--) {
285       idx   = 5*(*vi--);
286       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
287       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
288       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
289       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
290       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
291       v -= 25;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   PetscLogFlops(2*25*(a->nz) - 5*A->n);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
302 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
305   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
306   int             *diag = a->diag,oidx;
307   MatScalar       *aa=a->a,*v;
308   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
309   PetscScalar     *x,*b;
310 
311   PetscFunctionBegin;
312   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
313   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
314 
315   /* forward solve the U^T */
316   idx = 0;
317   for (i=0; i<n; i++) {
318 
319     v     = aa + 36*diag[i];
320     /* multiply by the inverse of the block diagonal */
321     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
322     x6    = b[5+idx];
323     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
324     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
325     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
326     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
327     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
328     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
329     v += 36;
330 
331     vi    = aj + diag[i] + 1;
332     nz    = ai[i+1] - diag[i] - 1;
333     while (nz--) {
334       oidx = 6*(*vi++);
335       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
336       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
337       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
338       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
339       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
340       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
341       v  += 36;
342     }
343     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
344     x[5+idx] = s6;
345     idx += 6;
346   }
347   /* backward solve the L^T */
348   for (i=n-1; i>=0; i--){
349     v    = aa + 36*diag[i] - 36;
350     vi   = aj + diag[i] - 1;
351     nz   = diag[i] - ai[i];
352     idt  = 6*i;
353     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
354     s6 = x[5+idt];
355     while (nz--) {
356       idx   = 6*(*vi--);
357       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
358       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
359       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
360       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
361       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
362       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
363       v -= 36;
364     }
365   }
366   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
367   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
368   PetscLogFlops(2*36*(a->nz) - 6*A->n);
369   PetscFunctionReturn(0);
370 }
371 
372 #undef __FUNCT__
373 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
374 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
375 {
376   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
377   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
378   int             *diag = a->diag,oidx;
379   MatScalar       *aa=a->a,*v;
380   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
381   PetscScalar     *x,*b;
382 
383   PetscFunctionBegin;
384   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
385   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
386 
387   /* forward solve the U^T */
388   idx = 0;
389   for (i=0; i<n; i++) {
390 
391     v     = aa + 49*diag[i];
392     /* multiply by the inverse of the block diagonal */
393     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
394     x6    = b[5+idx]; x7 = b[6+idx];
395     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
396     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
397     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
398     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
399     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
400     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
401     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
402     v += 49;
403 
404     vi    = aj + diag[i] + 1;
405     nz    = ai[i+1] - diag[i] - 1;
406     while (nz--) {
407       oidx = 7*(*vi++);
408       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
409       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
410       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
411       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
412       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
413       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
414       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
415       v  += 49;
416     }
417     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
418     x[5+idx] = s6;x[6+idx] = s7;
419     idx += 7;
420   }
421   /* backward solve the L^T */
422   for (i=n-1; i>=0; i--){
423     v    = aa + 49*diag[i] - 49;
424     vi   = aj + diag[i] - 1;
425     nz   = diag[i] - ai[i];
426     idt  = 7*i;
427     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
428     s6 = x[5+idt];s7 = x[6+idt];
429     while (nz--) {
430       idx   = 7*(*vi--);
431       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
432       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
433       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
434       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
435       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
436       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
437       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
438       v -= 49;
439     }
440   }
441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
443   PetscLogFlops(2*49*(a->nz) - 7*A->n);
444   PetscFunctionReturn(0);
445 }
446 
447 /*---------------------------------------------------------------------------------------------*/
448 #undef __FUNCT__
449 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
450 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
451 {
452   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
453   IS              iscol=a->col,isrow=a->row;
454   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
455   int             *diag = a->diag;
456   MatScalar       *aa=a->a,*v;
457   PetscScalar     s1,*x,*b,*t;
458 
459   PetscFunctionBegin;
460   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
461   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
462   t  = a->solve_work;
463 
464   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
465   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
466 
467   /* copy the b into temp work space according to permutation */
468   for (i=0; i<n; i++) {
469     t[i] = b[c[i]];
470   }
471 
472   /* forward solve the U^T */
473   for (i=0; i<n; i++) {
474 
475     v     = aa + diag[i];
476     /* multiply by the inverse of the block diagonal */
477     s1    = (*v++)*t[i];
478     vi    = aj + diag[i] + 1;
479     nz    = ai[i+1] - diag[i] - 1;
480     while (nz--) {
481       t[*vi++]  -= (*v++)*s1;
482     }
483     t[i]   = s1;
484   }
485   /* backward solve the L^T */
486   for (i=n-1; i>=0; i--){
487     v    = aa + diag[i] - 1;
488     vi   = aj + diag[i] - 1;
489     nz   = diag[i] - ai[i];
490     s1   = t[i];
491     while (nz--) {
492       t[*vi--]   -=  (*v--)*s1;
493     }
494   }
495 
496   /* copy t into x according to permutation */
497   for (i=0; i<n; i++) {
498     x[r[i]]   = t[i];
499   }
500 
501   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
502   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
503   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
504   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
505   PetscLogFlops(2*(a->nz) - A->n);
506   PetscFunctionReturn(0);
507 }
508 
509 #undef __FUNCT__
510 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
511 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
512 {
513   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
514   IS              iscol=a->col,isrow=a->row;
515   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
516   int             *diag = a->diag,ii,ic,ir,oidx;
517   MatScalar       *aa=a->a,*v;
518   PetscScalar     s1,s2,x1,x2;
519   PetscScalar     *x,*b,*t;
520 
521   PetscFunctionBegin;
522   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
523   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524   t  = a->solve_work;
525 
526   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
527   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
528 
529   /* copy the b into temp work space according to permutation */
530   ii = 0;
531   for (i=0; i<n; i++) {
532     ic      = 2*c[i];
533     t[ii]   = b[ic];
534     t[ii+1] = b[ic+1];
535     ii += 2;
536   }
537 
538   /* forward solve the U^T */
539   idx = 0;
540   for (i=0; i<n; i++) {
541 
542     v     = aa + 4*diag[i];
543     /* multiply by the inverse of the block diagonal */
544     x1    = t[idx];   x2 = t[1+idx];
545     s1 = v[0]*x1  +  v[1]*x2;
546     s2 = v[2]*x1  +  v[3]*x2;
547     v += 4;
548 
549     vi    = aj + diag[i] + 1;
550     nz    = ai[i+1] - diag[i] - 1;
551     while (nz--) {
552       oidx = 2*(*vi++);
553       t[oidx]   -= v[0]*s1  +  v[1]*s2;
554       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
555       v  += 4;
556     }
557     t[idx]   = s1;t[1+idx] = s2;
558     idx += 2;
559   }
560   /* backward solve the L^T */
561   for (i=n-1; i>=0; i--){
562     v    = aa + 4*diag[i] - 4;
563     vi   = aj + diag[i] - 1;
564     nz   = diag[i] - ai[i];
565     idt  = 2*i;
566     s1 = t[idt];  s2 = t[1+idt];
567     while (nz--) {
568       idx   = 2*(*vi--);
569       t[idx]   -=  v[0]*s1 +  v[1]*s2;
570       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
571       v -= 4;
572     }
573   }
574 
575   /* copy t into x according to permutation */
576   ii = 0;
577   for (i=0; i<n; i++) {
578     ir      = 2*r[i];
579     x[ir]   = t[ii];
580     x[ir+1] = t[ii+1];
581     ii += 2;
582   }
583 
584   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
585   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
586   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
587   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
588   PetscLogFlops(2*4*(a->nz) - 2*A->n);
589   PetscFunctionReturn(0);
590 }
591 
592 #undef __FUNCT__
593 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
594 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
595 {
596   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
597   IS              iscol=a->col,isrow=a->row;
598   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
599   int             *diag = a->diag,ii,ic,ir,oidx;
600   MatScalar       *aa=a->a,*v;
601   PetscScalar     s1,s2,s3,x1,x2,x3;
602   PetscScalar     *x,*b,*t;
603 
604   PetscFunctionBegin;
605   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
606   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
607   t  = a->solve_work;
608 
609   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
610   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
611 
612   /* copy the b into temp work space according to permutation */
613   ii = 0;
614   for (i=0; i<n; i++) {
615     ic      = 3*c[i];
616     t[ii]   = b[ic];
617     t[ii+1] = b[ic+1];
618     t[ii+2] = b[ic+2];
619     ii += 3;
620   }
621 
622   /* forward solve the U^T */
623   idx = 0;
624   for (i=0; i<n; i++) {
625 
626     v     = aa + 9*diag[i];
627     /* multiply by the inverse of the block diagonal */
628     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
629     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
630     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
631     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
632     v += 9;
633 
634     vi    = aj + diag[i] + 1;
635     nz    = ai[i+1] - diag[i] - 1;
636     while (nz--) {
637       oidx = 3*(*vi++);
638       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
639       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
640       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
641       v  += 9;
642     }
643     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
644     idx += 3;
645   }
646   /* backward solve the L^T */
647   for (i=n-1; i>=0; i--){
648     v    = aa + 9*diag[i] - 9;
649     vi   = aj + diag[i] - 1;
650     nz   = diag[i] - ai[i];
651     idt  = 3*i;
652     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
653     while (nz--) {
654       idx   = 3*(*vi--);
655       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
656       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
657       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658       v -= 9;
659     }
660   }
661 
662   /* copy t into x according to permutation */
663   ii = 0;
664   for (i=0; i<n; i++) {
665     ir      = 3*r[i];
666     x[ir]   = t[ii];
667     x[ir+1] = t[ii+1];
668     x[ir+2] = t[ii+2];
669     ii += 3;
670   }
671 
672   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
673   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
674   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
675   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
676   PetscLogFlops(2*9*(a->nz) - 3*A->n);
677   PetscFunctionReturn(0);
678 }
679 
680 #undef __FUNCT__
681 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
682 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
683 {
684   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
685   IS              iscol=a->col,isrow=a->row;
686   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
687   int             *diag = a->diag,ii,ic,ir,oidx;
688   MatScalar       *aa=a->a,*v;
689   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
690   PetscScalar     *x,*b,*t;
691 
692   PetscFunctionBegin;
693   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
694   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
695   t  = a->solve_work;
696 
697   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
698   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
699 
700   /* copy the b into temp work space according to permutation */
701   ii = 0;
702   for (i=0; i<n; i++) {
703     ic      = 4*c[i];
704     t[ii]   = b[ic];
705     t[ii+1] = b[ic+1];
706     t[ii+2] = b[ic+2];
707     t[ii+3] = b[ic+3];
708     ii += 4;
709   }
710 
711   /* forward solve the U^T */
712   idx = 0;
713   for (i=0; i<n; i++) {
714 
715     v     = aa + 16*diag[i];
716     /* multiply by the inverse of the block diagonal */
717     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
718     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
719     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
720     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
721     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
722     v += 16;
723 
724     vi    = aj + diag[i] + 1;
725     nz    = ai[i+1] - diag[i] - 1;
726     while (nz--) {
727       oidx = 4*(*vi++);
728       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
729       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
730       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
731       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
732       v  += 16;
733     }
734     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
735     idx += 4;
736   }
737   /* backward solve the L^T */
738   for (i=n-1; i>=0; i--){
739     v    = aa + 16*diag[i] - 16;
740     vi   = aj + diag[i] - 1;
741     nz   = diag[i] - ai[i];
742     idt  = 4*i;
743     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
744     while (nz--) {
745       idx   = 4*(*vi--);
746       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750       v -= 16;
751     }
752   }
753 
754   /* copy t into x according to permutation */
755   ii = 0;
756   for (i=0; i<n; i++) {
757     ir      = 4*r[i];
758     x[ir]   = t[ii];
759     x[ir+1] = t[ii+1];
760     x[ir+2] = t[ii+2];
761     x[ir+3] = t[ii+3];
762     ii += 4;
763   }
764 
765   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
766   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
767   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   PetscLogFlops(2*16*(a->nz) - 4*A->n);
770   PetscFunctionReturn(0);
771 }
772 
773 #undef __FUNCT__
774 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
775 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
778   IS              iscol=a->col,isrow=a->row;
779   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
780   int             *diag = a->diag,ii,ic,ir,oidx;
781   MatScalar       *aa=a->a,*v;
782   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
783   PetscScalar     *x,*b,*t;
784 
785   PetscFunctionBegin;
786   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
788   t  = a->solve_work;
789 
790   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
791   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
792 
793   /* copy the b into temp work space according to permutation */
794   ii = 0;
795   for (i=0; i<n; i++) {
796     ic      = 5*c[i];
797     t[ii]   = b[ic];
798     t[ii+1] = b[ic+1];
799     t[ii+2] = b[ic+2];
800     t[ii+3] = b[ic+3];
801     t[ii+4] = b[ic+4];
802     ii += 5;
803   }
804 
805   /* forward solve the U^T */
806   idx = 0;
807   for (i=0; i<n; i++) {
808 
809     v     = aa + 25*diag[i];
810     /* multiply by the inverse of the block diagonal */
811     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
812     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
813     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
814     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
815     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
816     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
817     v += 25;
818 
819     vi    = aj + diag[i] + 1;
820     nz    = ai[i+1] - diag[i] - 1;
821     while (nz--) {
822       oidx = 5*(*vi++);
823       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
824       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
825       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
826       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
827       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
828       v  += 25;
829     }
830     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
831     idx += 5;
832   }
833   /* backward solve the L^T */
834   for (i=n-1; i>=0; i--){
835     v    = aa + 25*diag[i] - 25;
836     vi   = aj + diag[i] - 1;
837     nz   = diag[i] - ai[i];
838     idt  = 5*i;
839     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
840     while (nz--) {
841       idx   = 5*(*vi--);
842       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847       v -= 25;
848     }
849   }
850 
851   /* copy t into x according to permutation */
852   ii = 0;
853   for (i=0; i<n; i++) {
854     ir      = 5*r[i];
855     x[ir]   = t[ii];
856     x[ir+1] = t[ii+1];
857     x[ir+2] = t[ii+2];
858     x[ir+3] = t[ii+3];
859     x[ir+4] = t[ii+4];
860     ii += 5;
861   }
862 
863   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
864   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
865   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
866   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
867   PetscLogFlops(2*25*(a->nz) - 5*A->n);
868   PetscFunctionReturn(0);
869 }
870 
871 #undef __FUNCT__
872 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
873 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
874 {
875   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
876   IS              iscol=a->col,isrow=a->row;
877   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
878   int             *diag = a->diag,ii,ic,ir,oidx;
879   MatScalar       *aa=a->a,*v;
880   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
881   PetscScalar     *x,*b,*t;
882 
883   PetscFunctionBegin;
884   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
885   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
886   t  = a->solve_work;
887 
888   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
889   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
890 
891   /* copy the b into temp work space according to permutation */
892   ii = 0;
893   for (i=0; i<n; i++) {
894     ic      = 6*c[i];
895     t[ii]   = b[ic];
896     t[ii+1] = b[ic+1];
897     t[ii+2] = b[ic+2];
898     t[ii+3] = b[ic+3];
899     t[ii+4] = b[ic+4];
900     t[ii+5] = b[ic+5];
901     ii += 6;
902   }
903 
904   /* forward solve the U^T */
905   idx = 0;
906   for (i=0; i<n; i++) {
907 
908     v     = aa + 36*diag[i];
909     /* multiply by the inverse of the block diagonal */
910     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
911     x6    = t[5+idx];
912     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
913     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
914     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
915     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
916     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
917     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
918     v += 36;
919 
920     vi    = aj + diag[i] + 1;
921     nz    = ai[i+1] - diag[i] - 1;
922     while (nz--) {
923       oidx = 6*(*vi++);
924       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
925       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
926       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
927       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
928       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
929       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
930       v  += 36;
931     }
932     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
933     t[5+idx] = s6;
934     idx += 6;
935   }
936   /* backward solve the L^T */
937   for (i=n-1; i>=0; i--){
938     v    = aa + 36*diag[i] - 36;
939     vi   = aj + diag[i] - 1;
940     nz   = diag[i] - ai[i];
941     idt  = 6*i;
942     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
943     s6 = t[5+idt];
944     while (nz--) {
945       idx   = 6*(*vi--);
946       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
947       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
948       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
949       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
950       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
951       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
952       v -= 36;
953     }
954   }
955 
956   /* copy t into x according to permutation */
957   ii = 0;
958   for (i=0; i<n; i++) {
959     ir      = 6*r[i];
960     x[ir]   = t[ii];
961     x[ir+1] = t[ii+1];
962     x[ir+2] = t[ii+2];
963     x[ir+3] = t[ii+3];
964     x[ir+4] = t[ii+4];
965     x[ir+5] = t[ii+5];
966     ii += 6;
967   }
968 
969   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
970   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
971   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
972   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
973   PetscLogFlops(2*36*(a->nz) - 6*A->n);
974   PetscFunctionReturn(0);
975 }
976 
977 #undef __FUNCT__
978 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
979 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
980 {
981   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
982   IS              iscol=a->col,isrow=a->row;
983   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
984   int             *diag = a->diag,ii,ic,ir,oidx;
985   MatScalar       *aa=a->a,*v;
986   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
987   PetscScalar     *x,*b,*t;
988 
989   PetscFunctionBegin;
990   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
991   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
992   t  = a->solve_work;
993 
994   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
995   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
996 
997   /* copy the b into temp work space according to permutation */
998   ii = 0;
999   for (i=0; i<n; i++) {
1000     ic      = 7*c[i];
1001     t[ii]   = b[ic];
1002     t[ii+1] = b[ic+1];
1003     t[ii+2] = b[ic+2];
1004     t[ii+3] = b[ic+3];
1005     t[ii+4] = b[ic+4];
1006     t[ii+5] = b[ic+5];
1007     t[ii+6] = b[ic+6];
1008     ii += 7;
1009   }
1010 
1011   /* forward solve the U^T */
1012   idx = 0;
1013   for (i=0; i<n; i++) {
1014 
1015     v     = aa + 49*diag[i];
1016     /* multiply by the inverse of the block diagonal */
1017     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1018     x6    = t[5+idx]; x7 = t[6+idx];
1019     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1020     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1021     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1022     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1023     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1024     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1025     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1026     v += 49;
1027 
1028     vi    = aj + diag[i] + 1;
1029     nz    = ai[i+1] - diag[i] - 1;
1030     while (nz--) {
1031       oidx = 7*(*vi++);
1032       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1033       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1034       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1035       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1036       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1037       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1038       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1039       v  += 49;
1040     }
1041     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1042     t[5+idx] = s6;t[6+idx] = s7;
1043     idx += 7;
1044   }
1045   /* backward solve the L^T */
1046   for (i=n-1; i>=0; i--){
1047     v    = aa + 49*diag[i] - 49;
1048     vi   = aj + diag[i] - 1;
1049     nz   = diag[i] - ai[i];
1050     idt  = 7*i;
1051     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1052     s6 = t[5+idt];s7 = t[6+idt];
1053     while (nz--) {
1054       idx   = 7*(*vi--);
1055       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1056       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1057       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1058       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1059       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1060       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1061       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1062       v -= 49;
1063     }
1064   }
1065 
1066   /* copy t into x according to permutation */
1067   ii = 0;
1068   for (i=0; i<n; i++) {
1069     ir      = 7*r[i];
1070     x[ir]   = t[ii];
1071     x[ir+1] = t[ii+1];
1072     x[ir+2] = t[ii+2];
1073     x[ir+3] = t[ii+3];
1074     x[ir+4] = t[ii+4];
1075     x[ir+5] = t[ii+5];
1076     x[ir+6] = t[ii+6];
1077     ii += 7;
1078   }
1079 
1080   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1081   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1082   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1083   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1084   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 /* ----------------------------------------------------------- */
1089 #undef __FUNCT__
1090 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1091 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1092 {
1093   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1094   IS              iscol=a->col,isrow=a->row;
1095   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1096   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1097   MatScalar       *aa=a->a,*v;
1098   PetscScalar     *x,*b,*s,*t,*ls;
1099 
1100   PetscFunctionBegin;
1101   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1102   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1103   t  = a->solve_work;
1104 
1105   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1106   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1107 
1108   /* forward solve the lower triangular */
1109   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1110   for (i=1; i<n; i++) {
1111     v   = aa + bs2*ai[i];
1112     vi  = aj + ai[i];
1113     nz  = a->diag[i] - ai[i];
1114     s = t + bs*i;
1115     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1116     while (nz--) {
1117       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1118       v += bs2;
1119     }
1120   }
1121   /* backward solve the upper triangular */
1122   ls = a->solve_work + A->n;
1123   for (i=n-1; i>=0; i--){
1124     v   = aa + bs2*(a->diag[i] + 1);
1125     vi  = aj + a->diag[i] + 1;
1126     nz  = ai[i+1] - a->diag[i] - 1;
1127     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1128     while (nz--) {
1129       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1130       v += bs2;
1131     }
1132     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1133     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1134   }
1135 
1136   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1137   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1138   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1139   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1140   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 #undef __FUNCT__
1145 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1146 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1147 {
1148   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1149   IS              iscol=a->col,isrow=a->row;
1150   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1151   int             *diag = a->diag;
1152   MatScalar       *aa=a->a,*v;
1153   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1154   PetscScalar     *x,*b,*t;
1155 
1156   PetscFunctionBegin;
1157   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1158   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1159   t  = a->solve_work;
1160 
1161   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1162   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1163 
1164   /* forward solve the lower triangular */
1165   idx    = 7*(*r++);
1166   t[0] = b[idx];   t[1] = b[1+idx];
1167   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1168   t[5] = b[5+idx]; t[6] = b[6+idx];
1169 
1170   for (i=1; i<n; i++) {
1171     v     = aa + 49*ai[i];
1172     vi    = aj + ai[i];
1173     nz    = diag[i] - ai[i];
1174     idx   = 7*(*r++);
1175     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1176     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1177     while (nz--) {
1178       idx   = 7*(*vi++);
1179       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1180       x4    = t[3+idx];x5 = t[4+idx];
1181       x6    = t[5+idx];x7 = t[6+idx];
1182       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1183       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1184       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1185       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1186       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1187       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1188       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1189       v += 49;
1190     }
1191     idx = 7*i;
1192     t[idx]   = s1;t[1+idx] = s2;
1193     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1194     t[5+idx] = s6;t[6+idx] = s7;
1195   }
1196   /* backward solve the upper triangular */
1197   for (i=n-1; i>=0; i--){
1198     v    = aa + 49*diag[i] + 49;
1199     vi   = aj + diag[i] + 1;
1200     nz   = ai[i+1] - diag[i] - 1;
1201     idt  = 7*i;
1202     s1 = t[idt];  s2 = t[1+idt];
1203     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1204     s6 = t[5+idt];s7 = t[6+idt];
1205     while (nz--) {
1206       idx   = 7*(*vi++);
1207       x1    = t[idx];   x2 = t[1+idx];
1208       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1209       x6    = t[5+idx]; x7 = t[6+idx];
1210       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1211       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1212       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1213       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1214       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1215       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1216       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1217       v += 49;
1218     }
1219     idc = 7*(*c--);
1220     v   = aa + 49*diag[i];
1221     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1222                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1223     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1224                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1225     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1226                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1227     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1228                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1229     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1230                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1231     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1232                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1233     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1234                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1235   }
1236 
1237   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1238   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1239   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1241   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 #undef __FUNCT__
1246 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1247 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1248 {
1249   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1250   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1251   int             ierr,*diag = a->diag,jdx;
1252   MatScalar       *aa=a->a,*v;
1253   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1254 
1255   PetscFunctionBegin;
1256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1258   /* forward solve the lower triangular */
1259   idx    = 0;
1260   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1261   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1262   x[6] = b[6+idx];
1263   for (i=1; i<n; i++) {
1264     v     =  aa + 49*ai[i];
1265     vi    =  aj + ai[i];
1266     nz    =  diag[i] - ai[i];
1267     idx   =  7*i;
1268     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1269     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1270     s7  =  b[6+idx];
1271     while (nz--) {
1272       jdx   = 7*(*vi++);
1273       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1274       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1275       x7    = x[6+jdx];
1276       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1277       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1278       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1279       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1280       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1281       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1282       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1283       v += 49;
1284      }
1285     x[idx]   = s1;
1286     x[1+idx] = s2;
1287     x[2+idx] = s3;
1288     x[3+idx] = s4;
1289     x[4+idx] = s5;
1290     x[5+idx] = s6;
1291     x[6+idx] = s7;
1292   }
1293   /* backward solve the upper triangular */
1294   for (i=n-1; i>=0; i--){
1295     v    = aa + 49*diag[i] + 49;
1296     vi   = aj + diag[i] + 1;
1297     nz   = ai[i+1] - diag[i] - 1;
1298     idt  = 7*i;
1299     s1 = x[idt];   s2 = x[1+idt];
1300     s3 = x[2+idt]; s4 = x[3+idt];
1301     s5 = x[4+idt]; s6 = x[5+idt];
1302     s7 = x[6+idt];
1303     while (nz--) {
1304       idx   = 7*(*vi++);
1305       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1306       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1307       x7    = x[6+idx];
1308       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1309       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1310       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1311       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1312       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1313       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1314       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1315       v += 49;
1316     }
1317     v        = aa + 49*diag[i];
1318     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1319                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1320     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1321                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1322     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1323                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1324     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1325                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1326     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1327                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1328     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1329                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1330     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1331                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1332   }
1333 
1334   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1336   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 #undef __FUNCT__
1341 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1342 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1343 {
1344   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1345   IS              iscol=a->col,isrow=a->row;
1346   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1347   int             *diag = a->diag;
1348   MatScalar       *aa=a->a,*v;
1349   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1350 
1351   PetscFunctionBegin;
1352   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354   t  = a->solve_work;
1355 
1356   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1358 
1359   /* forward solve the lower triangular */
1360   idx    = 6*(*r++);
1361   t[0] = b[idx];   t[1] = b[1+idx];
1362   t[2] = b[2+idx]; t[3] = b[3+idx];
1363   t[4] = b[4+idx]; t[5] = b[5+idx];
1364   for (i=1; i<n; i++) {
1365     v     = aa + 36*ai[i];
1366     vi    = aj + ai[i];
1367     nz    = diag[i] - ai[i];
1368     idx   = 6*(*r++);
1369     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1370     s5  = b[4+idx]; s6 = b[5+idx];
1371     while (nz--) {
1372       idx   = 6*(*vi++);
1373       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1374       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1375       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1376       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1377       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1378       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1379       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1380       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1381       v += 36;
1382     }
1383     idx = 6*i;
1384     t[idx]   = s1;t[1+idx] = s2;
1385     t[2+idx] = s3;t[3+idx] = s4;
1386     t[4+idx] = s5;t[5+idx] = s6;
1387   }
1388   /* backward solve the upper triangular */
1389   for (i=n-1; i>=0; i--){
1390     v    = aa + 36*diag[i] + 36;
1391     vi   = aj + diag[i] + 1;
1392     nz   = ai[i+1] - diag[i] - 1;
1393     idt  = 6*i;
1394     s1 = t[idt];  s2 = t[1+idt];
1395     s3 = t[2+idt];s4 = t[3+idt];
1396     s5 = t[4+idt];s6 = t[5+idt];
1397     while (nz--) {
1398       idx   = 6*(*vi++);
1399       x1    = t[idx];   x2 = t[1+idx];
1400       x3    = t[2+idx]; x4 = t[3+idx];
1401       x5    = t[4+idx]; x6 = t[5+idx];
1402       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1403       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1404       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1405       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1406       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1407       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1408       v += 36;
1409     }
1410     idc = 6*(*c--);
1411     v   = aa + 36*diag[i];
1412     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1413                                  v[18]*s4+v[24]*s5+v[30]*s6;
1414     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1415                                  v[19]*s4+v[25]*s5+v[31]*s6;
1416     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1417                                  v[20]*s4+v[26]*s5+v[32]*s6;
1418     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1419                                  v[21]*s4+v[27]*s5+v[33]*s6;
1420     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1421                                  v[22]*s4+v[28]*s5+v[34]*s6;
1422     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1423                                  v[23]*s4+v[29]*s5+v[35]*s6;
1424   }
1425 
1426   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1427   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1428   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1429   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1430   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 #undef __FUNCT__
1435 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1436 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1437 {
1438   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1439   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1440   int             ierr,*diag = a->diag,jdx;
1441   MatScalar       *aa=a->a,*v;
1442   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1443 
1444   PetscFunctionBegin;
1445   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1446   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1447   /* forward solve the lower triangular */
1448   idx    = 0;
1449   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1450   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1451   for (i=1; i<n; i++) {
1452     v     =  aa + 36*ai[i];
1453     vi    =  aj + ai[i];
1454     nz    =  diag[i] - ai[i];
1455     idx   =  6*i;
1456     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1457     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1458     while (nz--) {
1459       jdx   = 6*(*vi++);
1460       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1461       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1462       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1463       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1464       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1465       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1466       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1467       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1468       v += 36;
1469      }
1470     x[idx]   = s1;
1471     x[1+idx] = s2;
1472     x[2+idx] = s3;
1473     x[3+idx] = s4;
1474     x[4+idx] = s5;
1475     x[5+idx] = s6;
1476   }
1477   /* backward solve the upper triangular */
1478   for (i=n-1; i>=0; i--){
1479     v    = aa + 36*diag[i] + 36;
1480     vi   = aj + diag[i] + 1;
1481     nz   = ai[i+1] - diag[i] - 1;
1482     idt  = 6*i;
1483     s1 = x[idt];   s2 = x[1+idt];
1484     s3 = x[2+idt]; s4 = x[3+idt];
1485     s5 = x[4+idt]; s6 = x[5+idt];
1486     while (nz--) {
1487       idx   = 6*(*vi++);
1488       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1489       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1490       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496       v += 36;
1497     }
1498     v        = aa + 36*diag[i];
1499     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1500     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1501     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1502     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1503     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1504     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1505   }
1506 
1507   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1508   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1509   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1510   PetscFunctionReturn(0);
1511 }
1512 
1513 #undef __FUNCT__
1514 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1515 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1516 {
1517   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1518   IS              iscol=a->col,isrow=a->row;
1519   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1520   int             *diag = a->diag;
1521   MatScalar       *aa=a->a,*v;
1522   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1523 
1524   PetscFunctionBegin;
1525   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1526   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1527   t  = a->solve_work;
1528 
1529   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1530   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1531 
1532   /* forward solve the lower triangular */
1533   idx    = 5*(*r++);
1534   t[0] = b[idx];   t[1] = b[1+idx];
1535   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1536   for (i=1; i<n; i++) {
1537     v     = aa + 25*ai[i];
1538     vi    = aj + ai[i];
1539     nz    = diag[i] - ai[i];
1540     idx   = 5*(*r++);
1541     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1542     s5  = b[4+idx];
1543     while (nz--) {
1544       idx   = 5*(*vi++);
1545       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1546       x4    = t[3+idx];x5 = t[4+idx];
1547       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1548       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1549       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1550       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1551       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1552       v += 25;
1553     }
1554     idx = 5*i;
1555     t[idx]   = s1;t[1+idx] = s2;
1556     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1557   }
1558   /* backward solve the upper triangular */
1559   for (i=n-1; i>=0; i--){
1560     v    = aa + 25*diag[i] + 25;
1561     vi   = aj + diag[i] + 1;
1562     nz   = ai[i+1] - diag[i] - 1;
1563     idt  = 5*i;
1564     s1 = t[idt];  s2 = t[1+idt];
1565     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1566     while (nz--) {
1567       idx   = 5*(*vi++);
1568       x1    = t[idx];   x2 = t[1+idx];
1569       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1570       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1571       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1572       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1573       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1574       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1575       v += 25;
1576     }
1577     idc = 5*(*c--);
1578     v   = aa + 25*diag[i];
1579     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1580                                  v[15]*s4+v[20]*s5;
1581     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1582                                  v[16]*s4+v[21]*s5;
1583     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1584                                  v[17]*s4+v[22]*s5;
1585     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1586                                  v[18]*s4+v[23]*s5;
1587     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1588                                  v[19]*s4+v[24]*s5;
1589   }
1590 
1591   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1592   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1595   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 #undef __FUNCT__
1600 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1601 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1602 {
1603   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1604   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1605   int             ierr,*diag = a->diag,jdx;
1606   MatScalar       *aa=a->a,*v;
1607   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1608 
1609   PetscFunctionBegin;
1610   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1611   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1612   /* forward solve the lower triangular */
1613   idx    = 0;
1614   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1615   for (i=1; i<n; i++) {
1616     v     =  aa + 25*ai[i];
1617     vi    =  aj + ai[i];
1618     nz    =  diag[i] - ai[i];
1619     idx   =  5*i;
1620     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1621     while (nz--) {
1622       jdx   = 5*(*vi++);
1623       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1624       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1625       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1626       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1627       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1628       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1629       v    += 25;
1630     }
1631     x[idx]   = s1;
1632     x[1+idx] = s2;
1633     x[2+idx] = s3;
1634     x[3+idx] = s4;
1635     x[4+idx] = s5;
1636   }
1637   /* backward solve the upper triangular */
1638   for (i=n-1; i>=0; i--){
1639     v    = aa + 25*diag[i] + 25;
1640     vi   = aj + diag[i] + 1;
1641     nz   = ai[i+1] - diag[i] - 1;
1642     idt  = 5*i;
1643     s1 = x[idt];  s2 = x[1+idt];
1644     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1645     while (nz--) {
1646       idx   = 5*(*vi++);
1647       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1648       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1649       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1650       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1651       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1652       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1653       v    += 25;
1654     }
1655     v        = aa + 25*diag[i];
1656     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1657     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1658     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1659     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1660     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1661   }
1662 
1663   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1664   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1665   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1666   PetscFunctionReturn(0);
1667 }
1668 
1669 #undef __FUNCT__
1670 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1671 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1672 {
1673   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1674   IS              iscol=a->col,isrow=a->row;
1675   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1676   int             *diag = a->diag;
1677   MatScalar       *aa=a->a,*v;
1678   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1679 
1680   PetscFunctionBegin;
1681   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1682   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1683   t  = a->solve_work;
1684 
1685   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1686   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1687 
1688   /* forward solve the lower triangular */
1689   idx    = 4*(*r++);
1690   t[0] = b[idx];   t[1] = b[1+idx];
1691   t[2] = b[2+idx]; t[3] = b[3+idx];
1692   for (i=1; i<n; i++) {
1693     v     = aa + 16*ai[i];
1694     vi    = aj + ai[i];
1695     nz    = diag[i] - ai[i];
1696     idx   = 4*(*r++);
1697     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1698     while (nz--) {
1699       idx   = 4*(*vi++);
1700       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1701       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1702       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1703       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1704       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1705       v    += 16;
1706     }
1707     idx        = 4*i;
1708     t[idx]   = s1;t[1+idx] = s2;
1709     t[2+idx] = s3;t[3+idx] = s4;
1710   }
1711   /* backward solve the upper triangular */
1712   for (i=n-1; i>=0; i--){
1713     v    = aa + 16*diag[i] + 16;
1714     vi   = aj + diag[i] + 1;
1715     nz   = ai[i+1] - diag[i] - 1;
1716     idt  = 4*i;
1717     s1 = t[idt];  s2 = t[1+idt];
1718     s3 = t[2+idt];s4 = t[3+idt];
1719     while (nz--) {
1720       idx   = 4*(*vi++);
1721       x1    = t[idx];   x2 = t[1+idx];
1722       x3    = t[2+idx]; x4 = t[3+idx];
1723       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1724       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1725       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1726       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1727       v += 16;
1728     }
1729     idc      = 4*(*c--);
1730     v        = aa + 16*diag[i];
1731     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1732     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1733     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1734     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1735   }
1736 
1737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1742   PetscFunctionReturn(0);
1743 }
1744 #if defined (PETSC_HAVE_SSE)
1745 
1746 #include PETSC_HAVE_SSE
1747 
1748 #undef __FUNCT__
1749 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1750 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1751 {
1752   /*
1753      Note: This code uses demotion of double
1754      to float when performing the mixed-mode computation.
1755      This may not be numerically reasonable for all applications.
1756   */
1757   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1758   IS              iscol=a->col,isrow=a->row;
1759   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1760   int             *diag = a->diag,ai16;
1761   MatScalar       *aa=a->a,*v;
1762   PetscScalar     *x,*b,*t;
1763 
1764   /* Make space in temp stack for 16 Byte Aligned arrays */
1765   float           ssealignedspace[11],*tmps,*tmpx;
1766   unsigned long   offset;
1767 
1768   PetscFunctionBegin;
1769   SSE_SCOPE_BEGIN;
1770 
1771     offset = (unsigned long)ssealignedspace % 16;
1772     if (offset) offset = (16 - offset)/4;
1773     tmps = &ssealignedspace[offset];
1774     tmpx = &ssealignedspace[offset+4];
1775     PREFETCH_NTA(aa+16*ai[1]);
1776 
1777     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1778     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1779     t  = a->solve_work;
1780 
1781     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1782     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1783 
1784     /* forward solve the lower triangular */
1785     idx  = 4*(*r++);
1786     t[0] = b[idx];   t[1] = b[1+idx];
1787     t[2] = b[2+idx]; t[3] = b[3+idx];
1788     v    =  aa + 16*ai[1];
1789 
1790     for (i=1; i<n;) {
1791       PREFETCH_NTA(&v[8]);
1792       vi   =  aj      + ai[i];
1793       nz   =  diag[i] - ai[i];
1794       idx  =  4*(*r++);
1795 
1796       /* Demote sum from double to float */
1797       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1798       LOAD_PS(tmps,XMM7);
1799 
1800       while (nz--) {
1801         PREFETCH_NTA(&v[16]);
1802         idx = 4*(*vi++);
1803 
1804         /* Demote solution (so far) from double to float */
1805         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1806 
1807         /* 4x4 Matrix-Vector product with negative accumulation: */
1808         SSE_INLINE_BEGIN_2(tmpx,v)
1809           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1810 
1811           /* First Column */
1812           SSE_COPY_PS(XMM0,XMM6)
1813           SSE_SHUFFLE(XMM0,XMM0,0x00)
1814           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1815           SSE_SUB_PS(XMM7,XMM0)
1816 
1817           /* Second Column */
1818           SSE_COPY_PS(XMM1,XMM6)
1819           SSE_SHUFFLE(XMM1,XMM1,0x55)
1820           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1821           SSE_SUB_PS(XMM7,XMM1)
1822 
1823           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1824 
1825           /* Third Column */
1826           SSE_COPY_PS(XMM2,XMM6)
1827           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1828           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1829           SSE_SUB_PS(XMM7,XMM2)
1830 
1831           /* Fourth Column */
1832           SSE_COPY_PS(XMM3,XMM6)
1833           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1834           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1835           SSE_SUB_PS(XMM7,XMM3)
1836         SSE_INLINE_END_2
1837 
1838         v  += 16;
1839       }
1840       idx = 4*i;
1841       v   = aa + 16*ai[++i];
1842       PREFETCH_NTA(v);
1843       STORE_PS(tmps,XMM7);
1844 
1845       /* Promote result from float to double */
1846       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1847     }
1848     /* backward solve the upper triangular */
1849     idt  = 4*(n-1);
1850     ai16 = 16*diag[n-1];
1851     v    = aa + ai16 + 16;
1852     for (i=n-1; i>=0;){
1853       PREFETCH_NTA(&v[8]);
1854       vi = aj + diag[i] + 1;
1855       nz = ai[i+1] - diag[i] - 1;
1856 
1857       /* Demote accumulator from double to float */
1858       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1859       LOAD_PS(tmps,XMM7);
1860 
1861       while (nz--) {
1862         PREFETCH_NTA(&v[16]);
1863         idx = 4*(*vi++);
1864 
1865         /* Demote solution (so far) from double to float */
1866         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1867 
1868         /* 4x4 Matrix-Vector Product with negative accumulation: */
1869         SSE_INLINE_BEGIN_2(tmpx,v)
1870           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1871 
1872           /* First Column */
1873           SSE_COPY_PS(XMM0,XMM6)
1874           SSE_SHUFFLE(XMM0,XMM0,0x00)
1875           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1876           SSE_SUB_PS(XMM7,XMM0)
1877 
1878           /* Second Column */
1879           SSE_COPY_PS(XMM1,XMM6)
1880           SSE_SHUFFLE(XMM1,XMM1,0x55)
1881           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1882           SSE_SUB_PS(XMM7,XMM1)
1883 
1884           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1885 
1886           /* Third Column */
1887           SSE_COPY_PS(XMM2,XMM6)
1888           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1889           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1890           SSE_SUB_PS(XMM7,XMM2)
1891 
1892           /* Fourth Column */
1893           SSE_COPY_PS(XMM3,XMM6)
1894           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1895           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1896           SSE_SUB_PS(XMM7,XMM3)
1897         SSE_INLINE_END_2
1898         v  += 16;
1899       }
1900       v    = aa + ai16;
1901       ai16 = 16*diag[--i];
1902       PREFETCH_NTA(aa+ai16+16);
1903       /*
1904          Scale the result by the diagonal 4x4 block,
1905          which was inverted as part of the factorization
1906       */
1907       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1908         /* First Column */
1909         SSE_COPY_PS(XMM0,XMM7)
1910         SSE_SHUFFLE(XMM0,XMM0,0x00)
1911         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1912 
1913         /* Second Column */
1914         SSE_COPY_PS(XMM1,XMM7)
1915         SSE_SHUFFLE(XMM1,XMM1,0x55)
1916         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1917         SSE_ADD_PS(XMM0,XMM1)
1918 
1919         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1920 
1921         /* Third Column */
1922         SSE_COPY_PS(XMM2,XMM7)
1923         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1924         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1925         SSE_ADD_PS(XMM0,XMM2)
1926 
1927         /* Fourth Column */
1928         SSE_COPY_PS(XMM3,XMM7)
1929         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1930         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1931         SSE_ADD_PS(XMM0,XMM3)
1932 
1933         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1934       SSE_INLINE_END_3
1935 
1936       /* Promote solution from float to double */
1937       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1938 
1939       /* Apply reordering to t and stream into x.    */
1940       /* This way, x doesn't pollute the cache.      */
1941       /* Be careful with size: 2 doubles = 4 floats! */
1942       idc  = 4*(*c--);
1943       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
1944         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1945         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1946         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1947         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1948         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1949         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1950       SSE_INLINE_END_2
1951       v    = aa + ai16 + 16;
1952       idt -= 4;
1953     }
1954 
1955     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1956     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1957     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1958     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1959     PetscLogFlops(2*16*(a->nz) - 4*A->n);
1960   SSE_SCOPE_END;
1961   PetscFunctionReturn(0);
1962 }
1963 
1964 #endif
1965 
1966 
1967 /*
1968       Special case where the matrix was ILU(0) factored in the natural
1969    ordering. This eliminates the need for the column and row permutation.
1970 */
1971 #undef __FUNCT__
1972 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
1973 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1974 {
1975   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1976   int             n=a->mbs,*ai=a->i,*aj=a->j;
1977   int             ierr,*diag = a->diag;
1978   MatScalar       *aa=a->a;
1979   PetscScalar     *x,*b;
1980 
1981   PetscFunctionBegin;
1982   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1983   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1984 
1985 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1986   {
1987     static PetscScalar w[2000]; /* very BAD need to fix */
1988     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1989   }
1990 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1991   {
1992     static PetscScalar w[2000]; /* very BAD need to fix */
1993     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1994   }
1995 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1996   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1997 #else
1998   {
1999     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2000     MatScalar    *v;
2001     int          jdx,idt,idx,nz,*vi,i,ai16;
2002 
2003   /* forward solve the lower triangular */
2004   idx    = 0;
2005   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2006   for (i=1; i<n; i++) {
2007     v     =  aa      + 16*ai[i];
2008     vi    =  aj      + ai[i];
2009     nz    =  diag[i] - ai[i];
2010     idx   +=  4;
2011     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2012     while (nz--) {
2013       jdx   = 4*(*vi++);
2014       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2015       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2016       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2017       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2018       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2019       v    += 16;
2020     }
2021     x[idx]   = s1;
2022     x[1+idx] = s2;
2023     x[2+idx] = s3;
2024     x[3+idx] = s4;
2025   }
2026   /* backward solve the upper triangular */
2027   idt = 4*(n-1);
2028   for (i=n-1; i>=0; i--){
2029     ai16 = 16*diag[i];
2030     v    = aa + ai16 + 16;
2031     vi   = aj + diag[i] + 1;
2032     nz   = ai[i+1] - diag[i] - 1;
2033     s1 = x[idt];  s2 = x[1+idt];
2034     s3 = x[2+idt];s4 = x[3+idt];
2035     while (nz--) {
2036       idx   = 4*(*vi++);
2037       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2038       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2039       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2040       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2041       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2042       v    += 16;
2043     }
2044     v        = aa + ai16;
2045     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2046     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2047     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2048     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2049     idt -= 4;
2050   }
2051   }
2052 #endif
2053 
2054   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2055   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2056   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2057   PetscFunctionReturn(0);
2058 }
2059 
2060 #if defined (PETSC_HAVE_SSE)
2061 
2062 #include PETSC_HAVE_SSE
2063 
2064 #undef __FUNCT__
2065 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2066 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2067 {
2068   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2069   int             n=a->mbs,*ai=a->i,*aj=a->j;
2070   int             ierr,*diag = a->diag;
2071   MatScalar       *aa=a->a;
2072   PetscScalar     *x,*b;
2073 
2074   PetscFunctionBegin;
2075   SSE_SCOPE_BEGIN;
2076   /*
2077      Note: This code currently uses demotion of double
2078      to float when performing the mixed-mode computation.
2079      This may not be numerically reasonable for all applications.
2080   */
2081   PREFETCH_NTA(aa+16*ai[1]);
2082 
2083   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2084   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2085   {
2086     MatScalar     *v;
2087     int           jdx,idt,idx,nz,*vi,i,ai16;
2088 
2089     /* Make space in temp stack for 16 Byte Aligned arrays */
2090     float         ssealignedspace[11],*tmps,*tmpx;
2091     unsigned long offset = (unsigned long)ssealignedspace % 16;
2092     if (offset) offset = (16 - offset)/4;
2093     tmps = &ssealignedspace[offset];
2094     tmpx = &ssealignedspace[offset+4];
2095 
2096   /* forward solve the lower triangular */
2097     idx  = 0;
2098     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2099     v    =  aa + 16*ai[1];
2100 
2101     for (i=1; i<n;) {
2102       PREFETCH_NTA(&v[8]);
2103       vi   =  aj      + ai[i];
2104       nz   =  diag[i] - ai[i];
2105       idx +=  4;
2106 
2107     /* Demote sum from double to float */
2108       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2109       LOAD_PS(tmps,XMM7);
2110 
2111       while (nz--) {
2112         PREFETCH_NTA(&v[16]);
2113         jdx = 4*(*vi++);
2114 
2115         /* Demote solution (so far) from double to float */
2116         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);
2117 
2118         /* 4x4 Matrix-Vector product with negative accumulation: */
2119         SSE_INLINE_BEGIN_2(tmpx,v)
2120           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2121 
2122           /* First Column */
2123           SSE_COPY_PS(XMM0,XMM6)
2124           SSE_SHUFFLE(XMM0,XMM0,0x00)
2125           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2126           SSE_SUB_PS(XMM7,XMM0)
2127 
2128           /* Second Column */
2129           SSE_COPY_PS(XMM1,XMM6)
2130           SSE_SHUFFLE(XMM1,XMM1,0x55)
2131           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2132           SSE_SUB_PS(XMM7,XMM1)
2133 
2134           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2135 
2136           /* Third Column */
2137           SSE_COPY_PS(XMM2,XMM6)
2138           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2139           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2140           SSE_SUB_PS(XMM7,XMM2)
2141 
2142           /* Fourth Column */
2143           SSE_COPY_PS(XMM3,XMM6)
2144           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2145           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2146           SSE_SUB_PS(XMM7,XMM3)
2147         SSE_INLINE_END_2
2148 
2149         v  += 16;
2150       }
2151       v    =  aa + 16*ai[++i];
2152       PREFETCH_NTA(v);
2153       STORE_PS(tmps,XMM7);
2154 
2155       /* Promote result from float to double */
2156       CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
2157     }
2158     /* backward solve the upper triangular */
2159     idt  = 4*(n-1);
2160     ai16 = 16*diag[n-1];
2161     v    = aa + ai16 + 16;
2162     for (i=n-1; i>=0;){
2163       PREFETCH_NTA(&v[8]);
2164       vi = aj + diag[i] + 1;
2165       nz = ai[i+1] - diag[i] - 1;
2166 
2167       /* Demote accumulator from double to float */
2168       CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
2169       LOAD_PS(tmps,XMM7);
2170 
2171       while (nz--) {
2172         PREFETCH_NTA(&v[16]);
2173         idx = 4*(*vi++);
2174 
2175         /* Demote solution (so far) from double to float */
2176         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2177 
2178         /* 4x4 Matrix-Vector Product with negative accumulation: */
2179         SSE_INLINE_BEGIN_2(tmpx,v)
2180           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2181 
2182           /* First Column */
2183           SSE_COPY_PS(XMM0,XMM6)
2184           SSE_SHUFFLE(XMM0,XMM0,0x00)
2185           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2186           SSE_SUB_PS(XMM7,XMM0)
2187 
2188           /* Second Column */
2189           SSE_COPY_PS(XMM1,XMM6)
2190           SSE_SHUFFLE(XMM1,XMM1,0x55)
2191           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2192           SSE_SUB_PS(XMM7,XMM1)
2193 
2194           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2195 
2196           /* Third Column */
2197           SSE_COPY_PS(XMM2,XMM6)
2198           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2199           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2200           SSE_SUB_PS(XMM7,XMM2)
2201 
2202           /* Fourth Column */
2203           SSE_COPY_PS(XMM3,XMM6)
2204           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2205           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2206           SSE_SUB_PS(XMM7,XMM3)
2207         SSE_INLINE_END_2
2208         v  += 16;
2209       }
2210       v    = aa + ai16;
2211       ai16 = 16*diag[--i];
2212       PREFETCH_NTA(aa+ai16+16);
2213       /*
2214          Scale the result by the diagonal 4x4 block,
2215          which was inverted as part of the factorization
2216       */
2217       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2218         /* First Column */
2219         SSE_COPY_PS(XMM0,XMM7)
2220         SSE_SHUFFLE(XMM0,XMM0,0x00)
2221         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2222 
2223         /* Second Column */
2224         SSE_COPY_PS(XMM1,XMM7)
2225         SSE_SHUFFLE(XMM1,XMM1,0x55)
2226         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2227         SSE_ADD_PS(XMM0,XMM1)
2228 
2229         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2230 
2231         /* Third Column */
2232         SSE_COPY_PS(XMM2,XMM7)
2233         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2234         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2235         SSE_ADD_PS(XMM0,XMM2)
2236 
2237         /* Fourth Column */
2238         SSE_COPY_PS(XMM3,XMM7)
2239         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2240         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2241         SSE_ADD_PS(XMM0,XMM3)
2242 
2243         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2244       SSE_INLINE_END_3
2245 
2246       /* Promote solution from float to double */
2247       CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);
2248 
2249       v    = aa + ai16 + 16;
2250       idt -= 4;
2251     }
2252   }
2253   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2254   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2255   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2256   SSE_SCOPE_END;
2257   PetscFunctionReturn(0);
2258 }
2259 
2260 #endif
2261 #undef __FUNCT__
2262 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2263 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2264 {
2265   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2266   IS              iscol=a->col,isrow=a->row;
2267   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2268   int             *diag = a->diag;
2269   MatScalar       *aa=a->a,*v;
2270   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
2271 
2272   PetscFunctionBegin;
2273   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2274   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2275   t  = a->solve_work;
2276 
2277   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2278   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2279 
2280   /* forward solve the lower triangular */
2281   idx    = 3*(*r++);
2282   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2283   for (i=1; i<n; i++) {
2284     v     = aa + 9*ai[i];
2285     vi    = aj + ai[i];
2286     nz    = diag[i] - ai[i];
2287     idx   = 3*(*r++);
2288     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2289     while (nz--) {
2290       idx   = 3*(*vi++);
2291       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2292       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2293       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2294       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2295       v += 9;
2296     }
2297     idx = 3*i;
2298     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2299   }
2300   /* backward solve the upper triangular */
2301   for (i=n-1; i>=0; i--){
2302     v    = aa + 9*diag[i] + 9;
2303     vi   = aj + diag[i] + 1;
2304     nz   = ai[i+1] - diag[i] - 1;
2305     idt  = 3*i;
2306     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2307     while (nz--) {
2308       idx   = 3*(*vi++);
2309       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2310       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2311       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2312       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2313       v += 9;
2314     }
2315     idc = 3*(*c--);
2316     v   = aa + 9*diag[i];
2317     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2318     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2319     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2320   }
2321   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2322   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2323   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2324   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2325   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2326   PetscFunctionReturn(0);
2327 }
2328 
2329 /*
2330       Special case where the matrix was ILU(0) factored in the natural
2331    ordering. This eliminates the need for the column and row permutation.
2332 */
2333 #undef __FUNCT__
2334 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2335 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2336 {
2337   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2338   int             n=a->mbs,*ai=a->i,*aj=a->j;
2339   int             ierr,*diag = a->diag;
2340   MatScalar       *aa=a->a,*v;
2341   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2342   int             jdx,idt,idx,nz,*vi,i;
2343 
2344   PetscFunctionBegin;
2345   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2346   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2347 
2348 
2349   /* forward solve the lower triangular */
2350   idx    = 0;
2351   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2352   for (i=1; i<n; i++) {
2353     v     =  aa      + 9*ai[i];
2354     vi    =  aj      + ai[i];
2355     nz    =  diag[i] - ai[i];
2356     idx   +=  3;
2357     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2358     while (nz--) {
2359       jdx   = 3*(*vi++);
2360       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2361       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2362       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2363       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2364       v    += 9;
2365     }
2366     x[idx]   = s1;
2367     x[1+idx] = s2;
2368     x[2+idx] = s3;
2369   }
2370   /* backward solve the upper triangular */
2371   for (i=n-1; i>=0; i--){
2372     v    = aa + 9*diag[i] + 9;
2373     vi   = aj + diag[i] + 1;
2374     nz   = ai[i+1] - diag[i] - 1;
2375     idt  = 3*i;
2376     s1 = x[idt];  s2 = x[1+idt];
2377     s3 = x[2+idt];
2378     while (nz--) {
2379       idx   = 3*(*vi++);
2380       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2381       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2382       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2383       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2384       v    += 9;
2385     }
2386     v        = aa +  9*diag[i];
2387     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2388     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2389     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2390   }
2391 
2392   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2393   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2394   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2395   PetscFunctionReturn(0);
2396 }
2397 
2398 #undef __FUNCT__
2399 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2400 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2401 {
2402   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2403   IS              iscol=a->col,isrow=a->row;
2404   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2405   int             *diag = a->diag;
2406   MatScalar       *aa=a->a,*v;
2407   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
2408 
2409   PetscFunctionBegin;
2410   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2411   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2412   t  = a->solve_work;
2413 
2414   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2415   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2416 
2417   /* forward solve the lower triangular */
2418   idx    = 2*(*r++);
2419   t[0] = b[idx]; t[1] = b[1+idx];
2420   for (i=1; i<n; i++) {
2421     v     = aa + 4*ai[i];
2422     vi    = aj + ai[i];
2423     nz    = diag[i] - ai[i];
2424     idx   = 2*(*r++);
2425     s1  = b[idx]; s2 = b[1+idx];
2426     while (nz--) {
2427       idx   = 2*(*vi++);
2428       x1    = t[idx]; x2 = t[1+idx];
2429       s1 -= v[0]*x1 + v[2]*x2;
2430       s2 -= v[1]*x1 + v[3]*x2;
2431       v += 4;
2432     }
2433     idx = 2*i;
2434     t[idx] = s1; t[1+idx] = s2;
2435   }
2436   /* backward solve the upper triangular */
2437   for (i=n-1; i>=0; i--){
2438     v    = aa + 4*diag[i] + 4;
2439     vi   = aj + diag[i] + 1;
2440     nz   = ai[i+1] - diag[i] - 1;
2441     idt  = 2*i;
2442     s1 = t[idt]; s2 = t[1+idt];
2443     while (nz--) {
2444       idx   = 2*(*vi++);
2445       x1    = t[idx]; x2 = t[1+idx];
2446       s1 -= v[0]*x1 + v[2]*x2;
2447       s2 -= v[1]*x1 + v[3]*x2;
2448       v += 4;
2449     }
2450     idc = 2*(*c--);
2451     v   = aa + 4*diag[i];
2452     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2453     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2454   }
2455   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2456   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2457   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2459   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2460   PetscFunctionReturn(0);
2461 }
2462 
2463 /*
2464       Special case where the matrix was ILU(0) factored in the natural
2465    ordering. This eliminates the need for the column and row permutation.
2466 */
2467 #undef __FUNCT__
2468 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2469 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2470 {
2471   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2472   int             n=a->mbs,*ai=a->i,*aj=a->j;
2473   int             ierr,*diag = a->diag;
2474   MatScalar       *aa=a->a,*v;
2475   PetscScalar     *x,*b,s1,s2,x1,x2;
2476   int             jdx,idt,idx,nz,*vi,i;
2477 
2478   PetscFunctionBegin;
2479   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2480   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2481 
2482   /* forward solve the lower triangular */
2483   idx    = 0;
2484   x[0]   = b[0]; x[1] = b[1];
2485   for (i=1; i<n; i++) {
2486     v     =  aa      + 4*ai[i];
2487     vi    =  aj      + ai[i];
2488     nz    =  diag[i] - ai[i];
2489     idx   +=  2;
2490     s1  =  b[idx];s2 = b[1+idx];
2491     while (nz--) {
2492       jdx   = 2*(*vi++);
2493       x1    = x[jdx];x2 = x[1+jdx];
2494       s1 -= v[0]*x1 + v[2]*x2;
2495       s2 -= v[1]*x1 + v[3]*x2;
2496       v    += 4;
2497     }
2498     x[idx]   = s1;
2499     x[1+idx] = s2;
2500   }
2501   /* backward solve the upper triangular */
2502   for (i=n-1; i>=0; i--){
2503     v    = aa + 4*diag[i] + 4;
2504     vi   = aj + diag[i] + 1;
2505     nz   = ai[i+1] - diag[i] - 1;
2506     idt  = 2*i;
2507     s1 = x[idt];  s2 = x[1+idt];
2508     while (nz--) {
2509       idx   = 2*(*vi++);
2510       x1    = x[idx];   x2 = x[1+idx];
2511       s1 -= v[0]*x1 + v[2]*x2;
2512       s2 -= v[1]*x1 + v[3]*x2;
2513       v    += 4;
2514     }
2515     v        = aa +  4*diag[i];
2516     x[idt]   = v[0]*s1 + v[2]*s2;
2517     x[1+idt] = v[1]*s1 + v[3]*s2;
2518   }
2519 
2520   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2521   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2522   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2523   PetscFunctionReturn(0);
2524 }
2525 
2526 #undef __FUNCT__
2527 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2528 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2529 {
2530   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2531   IS              iscol=a->col,isrow=a->row;
2532   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2533   int             *diag = a->diag;
2534   MatScalar       *aa=a->a,*v;
2535   PetscScalar     *x,*b,s1,*t;
2536 
2537   PetscFunctionBegin;
2538   if (!n) PetscFunctionReturn(0);
2539 
2540   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2541   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2542   t  = a->solve_work;
2543 
2544   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2545   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2546 
2547   /* forward solve the lower triangular */
2548   t[0] = b[*r++];
2549   for (i=1; i<n; i++) {
2550     v     = aa + ai[i];
2551     vi    = aj + ai[i];
2552     nz    = diag[i] - ai[i];
2553     s1  = b[*r++];
2554     while (nz--) {
2555       s1 -= (*v++)*t[*vi++];
2556     }
2557     t[i] = s1;
2558   }
2559   /* backward solve the upper triangular */
2560   for (i=n-1; i>=0; i--){
2561     v    = aa + diag[i] + 1;
2562     vi   = aj + diag[i] + 1;
2563     nz   = ai[i+1] - diag[i] - 1;
2564     s1 = t[i];
2565     while (nz--) {
2566       s1 -= (*v++)*t[*vi++];
2567     }
2568     x[*c--] = t[i] = aa[diag[i]]*s1;
2569   }
2570 
2571   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2572   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2573   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2574   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2575   PetscLogFlops(2*1*(a->nz) - A->n);
2576   PetscFunctionReturn(0);
2577 }
2578 /*
2579       Special case where the matrix was ILU(0) factored in the natural
2580    ordering. This eliminates the need for the column and row permutation.
2581 */
2582 #undef __FUNCT__
2583 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2584 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2585 {
2586   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2587   int             n=a->mbs,*ai=a->i,*aj=a->j;
2588   int             ierr,*diag = a->diag;
2589   MatScalar       *aa=a->a;
2590   PetscScalar     *x,*b;
2591   PetscScalar     s1,x1;
2592   MatScalar       *v;
2593   int             jdx,idt,idx,nz,*vi,i;
2594 
2595   PetscFunctionBegin;
2596   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2597   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2598 
2599   /* forward solve the lower triangular */
2600   idx    = 0;
2601   x[0]   = b[0];
2602   for (i=1; i<n; i++) {
2603     v     =  aa      + ai[i];
2604     vi    =  aj      + ai[i];
2605     nz    =  diag[i] - ai[i];
2606     idx   +=  1;
2607     s1  =  b[idx];
2608     while (nz--) {
2609       jdx   = *vi++;
2610       x1    = x[jdx];
2611       s1 -= v[0]*x1;
2612       v    += 1;
2613     }
2614     x[idx]   = s1;
2615   }
2616   /* backward solve the upper triangular */
2617   for (i=n-1; i>=0; i--){
2618     v    = aa + diag[i] + 1;
2619     vi   = aj + diag[i] + 1;
2620     nz   = ai[i+1] - diag[i] - 1;
2621     idt  = i;
2622     s1 = x[idt];
2623     while (nz--) {
2624       idx   = *vi++;
2625       x1    = x[idx];
2626       s1 -= v[0]*x1;
2627       v    += 1;
2628     }
2629     v        = aa +  diag[i];
2630     x[idt]   = v[0]*s1;
2631   }
2632   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2633   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2634   PetscLogFlops(2*(a->nz) - A->n);
2635   PetscFunctionReturn(0);
2636 }
2637 
2638 /* ----------------------------------------------------------------*/
2639 /*
2640      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2641    except that the data structure of Mat_SeqAIJ is slightly different.
2642    Not a good example of code reuse.
2643 */
2644 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
2645 
2646 #undef __FUNCT__
2647 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
2648 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2649 {
2650   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2651   IS          isicol;
2652   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2653   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2654   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2655   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2656   PetscTruth  col_identity,row_identity;
2657   PetscReal   f;
2658 
2659   PetscFunctionBegin;
2660   if (info) {
2661     f             = info->fill;
2662     levels        = (int)info->levels;
2663     diagonal_fill = (int)info->diagonal_fill;
2664   } else {
2665     f             = 1.0;
2666     levels        = 0;
2667     diagonal_fill = 0;
2668   }
2669   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2670   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2671   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2672 
2673   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2674     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2675     (*fact)->factor = FACTOR_LU;
2676     b               = (Mat_SeqBAIJ*)(*fact)->data;
2677     if (!b->diag) {
2678       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2679     }
2680     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2681     b->row        = isrow;
2682     b->col        = iscol;
2683     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2684     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2685     b->icol       = isicol;
2686     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2687   } else { /* general case perform the symbolic factorization */
2688     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
2689     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
2690 
2691     /* get new row pointers */
2692     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
2693     ainew[0] = 0;
2694     /* don't know how many column pointers are needed so estimate */
2695     jmax = (int)(f*ai[n] + 1);
2696     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
2697     /* ajfill is level of fill for each fill entry */
2698     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
2699     /* fill is a linked list of nonzeros in active row */
2700     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
2701     /* im is level for each filled value */
2702     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
2703     /* dloc is location of diagonal in factor */
2704     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
2705     dloc[0]  = 0;
2706     for (prow=0; prow<n; prow++) {
2707 
2708       /* copy prow into linked list */
2709       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2710       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
2711       xi         = aj + ai[r[prow]];
2712       fill[n]    = n;
2713       fill[prow] = -1; /* marker for diagonal entry */
2714       while (nz--) {
2715 	fm  = n;
2716 	idx = ic[*xi++];
2717 	do {
2718 	  m  = fm;
2719 	  fm = fill[m];
2720 	} while (fm < idx);
2721 	fill[m]   = idx;
2722 	fill[idx] = fm;
2723 	im[idx]   = 0;
2724       }
2725 
2726       /* make sure diagonal entry is included */
2727       if (diagonal_fill && fill[prow] == -1) {
2728 	fm = n;
2729 	while (fill[fm] < prow) fm = fill[fm];
2730 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
2731 	fill[fm]   = prow;
2732 	im[prow]   = 0;
2733 	nzf++;
2734 	dcount++;
2735       }
2736 
2737       nzi = 0;
2738       row = fill[n];
2739       while (row < prow) {
2740 	incrlev = im[row] + 1;
2741 	nz      = dloc[row];
2742 	xi      = ajnew  + ainew[row] + nz + 1;
2743 	flev    = ajfill + ainew[row] + nz + 1;
2744 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
2745 	fm      = row;
2746 	while (nnz-- > 0) {
2747 	  idx = *xi++;
2748 	  if (*flev + incrlev > levels) {
2749 	    flev++;
2750 	    continue;
2751 	  }
2752 	  do {
2753 	    m  = fm;
2754 	    fm = fill[m];
2755 	  } while (fm < idx);
2756 	  if (fm != idx) {
2757 	    im[idx]   = *flev + incrlev;
2758 	    fill[m]   = idx;
2759 	    fill[idx] = fm;
2760 	    fm        = idx;
2761 	    nzf++;
2762 	  } else {
2763 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2764 	  }
2765 	  flev++;
2766 	}
2767 	row = fill[row];
2768 	nzi++;
2769       }
2770       /* copy new filled row into permanent storage */
2771       ainew[prow+1] = ainew[prow] + nzf;
2772       if (ainew[prow+1] > jmax) {
2773 
2774 	/* estimate how much additional space we will need */
2775 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2776 	/* just double the memory each time */
2777 	int maxadd = jmax;
2778 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2779 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2780 	jmax += maxadd;
2781 
2782 	/* allocate a longer ajnew and ajfill */
2783 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2784 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2785 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
2786 	ajnew = xi;
2787 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2788 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2789 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
2790 	ajfill = xi;
2791 	realloc++; /* count how many reallocations are needed */
2792       }
2793       xi          = ajnew + ainew[prow];
2794       flev        = ajfill + ainew[prow];
2795       dloc[prow]  = nzi;
2796       fm          = fill[n];
2797       while (nzf--) {
2798 	*xi++   = fm;
2799 	*flev++ = im[fm];
2800 	fm      = fill[fm];
2801       }
2802       /* make sure row has diagonal entry */
2803       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2804 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
2805     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2806       }
2807     }
2808     ierr = PetscFree(ajfill);CHKERRQ(ierr);
2809     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2810     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2811     ierr = PetscFree(fill);CHKERRQ(ierr);
2812     ierr = PetscFree(im);CHKERRQ(ierr);
2813 
2814     {
2815       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2816       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
2817       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
2818       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
2819       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
2820       if (diagonal_fill) {
2821 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
2822       }
2823     }
2824 
2825     /* put together the new matrix */
2826     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
2827     PetscLogObjectParent(*fact,isicol);
2828     b = (Mat_SeqBAIJ*)(*fact)->data;
2829     ierr = PetscFree(b->imax);CHKERRQ(ierr);
2830     b->singlemalloc = PETSC_FALSE;
2831     len = bs2*ainew[n]*sizeof(MatScalar);
2832     /* the next line frees the default space generated by the Create() */
2833     ierr = PetscFree(b->a);CHKERRQ(ierr);
2834     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
2835     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
2836     b->j          = ajnew;
2837     b->i          = ainew;
2838     for (i=0; i<n; i++) dloc[i] += ainew[i];
2839     b->diag       = dloc;
2840     b->ilen       = 0;
2841     b->imax       = 0;
2842     b->row        = isrow;
2843     b->col        = iscol;
2844     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2845     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2846     b->icol       = isicol;
2847     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2848     /* In b structure:  Free imax, ilen, old a, old j.
2849        Allocate dloc, solve_work, new a, new j */
2850     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
2851     b->maxnz          = b->nz = ainew[n];
2852     (*fact)->factor   = FACTOR_LU;
2853 
2854     (*fact)->info.factor_mallocs    = realloc;
2855     (*fact)->info.fill_ratio_given  = f;
2856     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2857   }
2858 
2859   if (row_identity && col_identity) {
2860     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
2861   }
2862   PetscFunctionReturn(0);
2863 }
2864 
2865 #undef __FUNCT__
2866 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
2867 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
2868 {
2869   /*
2870       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2871       with natural ordering
2872   */
2873   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2874   PetscTruth  sse_enabled_local,sse_enabled_global;
2875   int         ierr;
2876 
2877   PetscFunctionBegin;
2878   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
2879   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
2880   switch (a->bs) {
2881   case 1:
2882     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2883     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
2884     break;
2885   case 2:
2886     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2887     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
2888     break;
2889   case 3:
2890     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2891     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
2892     break;
2893   case 4:
2894     ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr);
2895     if (sse_enabled_local) {
2896 #if defined(PETSC_HAVE_SSE)
2897       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
2898       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n");
2899 #else
2900       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
2901       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2902 #endif
2903     } else {
2904       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2905       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
2906     }
2907     break;
2908   case 5:
2909     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2910     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
2911     break;
2912   case 6:
2913     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2914     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
2915     break;
2916   case 7:
2917     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2918     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
2919     break;
2920   }
2921   PetscFunctionReturn(0);
2922 }
2923 
2924 #undef __FUNCT__
2925 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
2926 int MatSeqBAIJ_UpdateSolvers(Mat A)
2927 {
2928   /*
2929       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2930       with natural ordering
2931   */
2932   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2933   IS          row = a->row, col = a->col;
2934   PetscTruth  row_identity, col_identity;
2935   PetscTruth  sse_enabled_local, sse_enabled_global;
2936   PetscTruth  use_single, use_natural;
2937   int         ierr;
2938 
2939   PetscFunctionBegin;
2940 
2941   use_single  = PETSC_FALSE;
2942   use_natural = PETSC_FALSE;
2943 
2944   ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
2945   ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
2946 
2947   if (row_identity && col_identity) {
2948     use_natural = PETSC_TRUE;
2949   } else {
2950     use_natural = PETSC_FALSE;
2951   }
2952   switch (a->bs) {
2953   case 1:
2954     if (use_natural) {
2955       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
2956       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
2957       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
2958       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2959     } else {
2960       A->ops->solve           = MatSolve_SeqBAIJ_1;
2961       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
2962     }
2963     break;
2964   case 2:
2965     if (use_natural) {
2966       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2967       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
2968       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
2969       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2970     } else {
2971       A->ops->solve           = MatSolve_SeqBAIJ_2;
2972       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
2973     }
2974     break;
2975   case 3:
2976     if (use_natural) {
2977       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2978       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
2979       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
2980       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2981     } else {
2982       A->ops->solve           = MatSolve_SeqBAIJ_3;
2983       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
2984     }
2985     break;
2986   case 4:
2987     ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr);
2988     if (sse_enabled_local) {
2989       PetscTruth single_prec,flg;
2990       single_prec = flg = PETSC_FALSE;
2991       ierr = PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);CHKERRQ(ierr);
2992       if (flg) {
2993         a->single_precision_solves = single_prec;
2994       }
2995       if (a->single_precision_solves) {
2996         use_single = PETSC_TRUE;
2997       }
2998     } else {
2999       use_single = PETSC_FALSE;
3000     }
3001     if (use_natural) {
3002       if (use_single) {
3003 #if defined(PETSC_HAVE_SSE)
3004         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3005         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n");
3006 #else
3007       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3008       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3009 #endif
3010       } else {
3011         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3012         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3013       }
3014       A->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3015       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3016     } else {
3017       if (use_single) {
3018 #if defined(PETSC_HAVE_SSE)
3019         A->ops->solve           = MatSolve_SeqBAIJ_4_SSE_Demotion;
3020         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3021 #else
3022       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3023       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3024 #endif
3025       } else {
3026         A->ops->solve           = MatSolve_SeqBAIJ_4;
3027       }
3028       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3029     }
3030     break;
3031   case 5:
3032     if (use_natural) {
3033       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3034       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3035       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3036       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3037     } else {
3038       A->ops->solve           = MatSolve_SeqBAIJ_5;
3039       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3040     }
3041     break;
3042   case 6:
3043     if (use_natural) {
3044       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3045       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3046       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3047       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3048     } else {
3049       A->ops->solve           = MatSolve_SeqBAIJ_6;
3050       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3051     }
3052     break;
3053   case 7:
3054     if (use_natural) {
3055       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3056       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3057       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3058       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3059     } else {
3060       A->ops->solve           = MatSolve_SeqBAIJ_7;
3061       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3062     }
3063     break;
3064   default:
3065     A->ops->solve             = MatSolve_SeqBAIJ_N;
3066     break;
3067   }
3068   PetscFunctionReturn(0);
3069 }
3070 
3071 #undef __FUNCT__
3072 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3073 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3074   int ierr;
3075 
3076   PetscFunctionBegin;
3077   ierr = MatSeqBAIJ_UpdateSolvers(A);
3078   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3079     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3080   } else {
3081     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3082   }
3083   PetscFunctionReturn(0);
3084 }
3085 
3086 #undef __FUNCT__
3087 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3088 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3089   int ierr;
3090 
3091   PetscFunctionBegin;
3092   ierr = MatSeqBAIJ_UpdateSolvers(A);
3093   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3094   PetscFunctionReturn(0);
3095 }
3096 
3097 
3098