xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision d4e9f3b6443ec535fe5e409f785fa3cf019ceb6c)
1 /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17   int             *diag = a->diag;
18   MatScalar       *aa=a->a,*v;
19   PetscScalar     s1,*x,*b;
20 
21   PetscFunctionBegin;
22   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
23   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25 
26   /* forward solve the U^T */
27   for (i=0; i<n; i++) {
28 
29     v     = aa + diag[i];
30     /* multiply by the inverse of the block diagonal */
31     s1    = (*v++)*x[i];
32     vi    = aj + diag[i] + 1;
33     nz    = ai[i+1] - diag[i] - 1;
34     while (nz--) {
35       x[*vi++]  -= (*v++)*s1;
36     }
37     x[i]   = s1;
38   }
39   /* backward solve the L^T */
40   for (i=n-1; i>=0; i--){
41     v    = aa + diag[i] - 1;
42     vi   = aj + diag[i] - 1;
43     nz   = diag[i] - ai[i];
44     s1   = x[i];
45     while (nz--) {
46       x[*vi--]   -=  (*v--)*s1;
47     }
48   }
49   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51   PetscLogFlops(2*(a->nz) - A->n);
52   PetscFunctionReturn(0);
53 }
54 
55 #undef __FUNCT__
56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
57 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58 {
59   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
60   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61   int             *diag = a->diag,oidx;
62   MatScalar       *aa=a->a,*v;
63   PetscScalar     s1,s2,x1,x2;
64   PetscScalar     *x,*b;
65 
66   PetscFunctionBegin;
67   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
68   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
69   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70 
71   /* forward solve the U^T */
72   idx = 0;
73   for (i=0; i<n; i++) {
74 
75     v     = aa + 4*diag[i];
76     /* multiply by the inverse of the block diagonal */
77     x1 = x[idx];   x2 = x[1+idx];
78     s1 = v[0]*x1  +  v[1]*x2;
79     s2 = v[2]*x1  +  v[3]*x2;
80     v += 4;
81 
82     vi    = aj + diag[i] + 1;
83     nz    = ai[i+1] - diag[i] - 1;
84     while (nz--) {
85       oidx = 2*(*vi++);
86       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88       v  += 4;
89     }
90     x[idx]   = s1;x[1+idx] = s2;
91     idx += 2;
92   }
93   /* backward solve the L^T */
94   for (i=n-1; i>=0; i--){
95     v    = aa + 4*diag[i] - 4;
96     vi   = aj + diag[i] - 1;
97     nz   = diag[i] - ai[i];
98     idt  = 2*i;
99     s1   = x[idt];  s2 = x[1+idt];
100     while (nz--) {
101       idx   = 2*(*vi--);
102       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104       v -= 4;
105     }
106   }
107   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110   PetscFunctionReturn(0);
111 }
112 
113 #undef __FUNCT__
114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
115 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116 {
117   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119   int             *diag = a->diag,oidx;
120   MatScalar       *aa=a->a,*v;
121   PetscScalar     s1,s2,s3,x1,x2,x3;
122   PetscScalar     *x,*b;
123 
124   PetscFunctionBegin;
125   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
126   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
127   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
128 
129   /* forward solve the U^T */
130   idx = 0;
131   for (i=0; i<n; i++) {
132 
133     v     = aa + 9*diag[i];
134     /* multiply by the inverse of the block diagonal */
135     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
136     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
137     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
138     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
139     v += 9;
140 
141     vi    = aj + diag[i] + 1;
142     nz    = ai[i+1] - diag[i] - 1;
143     while (nz--) {
144       oidx = 3*(*vi++);
145       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
146       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
147       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148       v  += 9;
149     }
150     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
151     idx += 3;
152   }
153   /* backward solve the L^T */
154   for (i=n-1; i>=0; i--){
155     v    = aa + 9*diag[i] - 9;
156     vi   = aj + diag[i] - 1;
157     nz   = diag[i] - ai[i];
158     idt  = 3*i;
159     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
160     while (nz--) {
161       idx   = 3*(*vi--);
162       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
163       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
164       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165       v -= 9;
166     }
167   }
168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170   PetscLogFlops(2*9*(a->nz) - 3*A->n);
171   PetscFunctionReturn(0);
172 }
173 
174 #undef __FUNCT__
175 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
176 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177 {
178   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
179   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180   int             *diag = a->diag,oidx;
181   MatScalar       *aa=a->a,*v;
182   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
183   PetscScalar     *x,*b;
184 
185   PetscFunctionBegin;
186   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
187   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
188   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189 
190   /* forward solve the U^T */
191   idx = 0;
192   for (i=0; i<n; i++) {
193 
194     v     = aa + 16*diag[i];
195     /* multiply by the inverse of the block diagonal */
196     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
197     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
198     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
199     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
200     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201     v += 16;
202 
203     vi    = aj + diag[i] + 1;
204     nz    = ai[i+1] - diag[i] - 1;
205     while (nz--) {
206       oidx = 4*(*vi++);
207       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
208       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
209       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211       v  += 16;
212     }
213     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214     idx += 4;
215   }
216   /* backward solve the L^T */
217   for (i=n-1; i>=0; i--){
218     v    = aa + 16*diag[i] - 16;
219     vi   = aj + diag[i] - 1;
220     nz   = diag[i] - ai[i];
221     idt  = 4*i;
222     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223     while (nz--) {
224       idx   = 4*(*vi--);
225       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
226       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
227       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229       v -= 16;
230     }
231   }
232   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   PetscLogFlops(2*16*(a->nz) - 4*A->n);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
240 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
243   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244   int             *diag = a->diag,oidx;
245   MatScalar       *aa=a->a,*v;
246   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
247   PetscScalar     *x,*b;
248 
249   PetscFunctionBegin;
250   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
251   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
252   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
253 
254   /* forward solve the U^T */
255   idx = 0;
256   for (i=0; i<n; i++) {
257 
258     v     = aa + 25*diag[i];
259     /* multiply by the inverse of the block diagonal */
260     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
262     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
263     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266     v += 25;
267 
268     vi    = aj + diag[i] + 1;
269     nz    = ai[i+1] - diag[i] - 1;
270     while (nz--) {
271       oidx = 5*(*vi++);
272       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
273       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
274       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277       v  += 25;
278     }
279     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280     idx += 5;
281   }
282   /* backward solve the L^T */
283   for (i=n-1; i>=0; i--){
284     v    = aa + 25*diag[i] - 25;
285     vi   = aj + diag[i] - 1;
286     nz   = diag[i] - ai[i];
287     idt  = 5*i;
288     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289     while (nz--) {
290       idx   = 5*(*vi--);
291       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
292       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
293       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296       v -= 25;
297     }
298   }
299   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
300   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
301   PetscLogFlops(2*25*(a->nz) - 5*A->n);
302   PetscFunctionReturn(0);
303 }
304 
305 #undef __FUNCT__
306 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
307 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308 {
309   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
310   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311   int             *diag = a->diag,oidx;
312   MatScalar       *aa=a->a,*v;
313   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
314   PetscScalar     *x,*b;
315 
316   PetscFunctionBegin;
317   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
318   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
319   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
320 
321   /* forward solve the U^T */
322   idx = 0;
323   for (i=0; i<n; i++) {
324 
325     v     = aa + 36*diag[i];
326     /* multiply by the inverse of the block diagonal */
327     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328     x6    = x[5+idx];
329     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
330     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
331     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335     v += 36;
336 
337     vi    = aj + diag[i] + 1;
338     nz    = ai[i+1] - diag[i] - 1;
339     while (nz--) {
340       oidx = 6*(*vi++);
341       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
342       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
343       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347       v  += 36;
348     }
349     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350     x[5+idx] = s6;
351     idx += 6;
352   }
353   /* backward solve the L^T */
354   for (i=n-1; i>=0; i--){
355     v    = aa + 36*diag[i] - 36;
356     vi   = aj + diag[i] - 1;
357     nz   = diag[i] - ai[i];
358     idt  = 6*i;
359     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360     s6 = x[5+idt];
361     while (nz--) {
362       idx   = 6*(*vi--);
363       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
364       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
365       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369       v -= 36;
370     }
371   }
372   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
373   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
374   PetscLogFlops(2*36*(a->nz) - 6*A->n);
375   PetscFunctionReturn(0);
376 }
377 
378 #undef __FUNCT__
379 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
380 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381 {
382   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
383   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384   int             *diag = a->diag,oidx;
385   MatScalar       *aa=a->a,*v;
386   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
387   PetscScalar     *x,*b;
388 
389   PetscFunctionBegin;
390   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
391   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
392   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393 
394   /* forward solve the U^T */
395   idx = 0;
396   for (i=0; i<n; i++) {
397 
398     v     = aa + 49*diag[i];
399     /* multiply by the inverse of the block diagonal */
400     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401     x6    = x[5+idx]; x7 = x[6+idx];
402     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
403     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409     v += 49;
410 
411     vi    = aj + diag[i] + 1;
412     nz    = ai[i+1] - diag[i] - 1;
413     while (nz--) {
414       oidx = 7*(*vi++);
415       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
416       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422       v  += 49;
423     }
424     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425     x[5+idx] = s6;x[6+idx] = s7;
426     idx += 7;
427   }
428   /* backward solve the L^T */
429   for (i=n-1; i>=0; i--){
430     v    = aa + 49*diag[i] - 49;
431     vi   = aj + diag[i] - 1;
432     nz   = diag[i] - ai[i];
433     idt  = 7*i;
434     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435     s6 = x[5+idt];s7 = x[6+idt];
436     while (nz--) {
437       idx   = 7*(*vi--);
438       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
439       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445       v -= 49;
446     }
447   }
448   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
449   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
450   PetscLogFlops(2*49*(a->nz) - 7*A->n);
451   PetscFunctionReturn(0);
452 }
453 
454 /*---------------------------------------------------------------------------------------------*/
455 #undef __FUNCT__
456 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
457 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458 {
459   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
460   IS              iscol=a->col,isrow=a->row;
461   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462   int             *diag = a->diag;
463   MatScalar       *aa=a->a,*v;
464   PetscScalar     s1,*x,*b,*t;
465 
466   PetscFunctionBegin;
467   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
468   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
469   t  = a->solve_work;
470 
471   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
472   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
473 
474   /* copy the b into temp work space according to permutation */
475   for (i=0; i<n; i++) {
476     t[i] = b[c[i]];
477   }
478 
479   /* forward solve the U^T */
480   for (i=0; i<n; i++) {
481 
482     v     = aa + diag[i];
483     /* multiply by the inverse of the block diagonal */
484     s1    = (*v++)*t[i];
485     vi    = aj + diag[i] + 1;
486     nz    = ai[i+1] - diag[i] - 1;
487     while (nz--) {
488       t[*vi++]  -= (*v++)*s1;
489     }
490     t[i]   = s1;
491   }
492   /* backward solve the L^T */
493   for (i=n-1; i>=0; i--){
494     v    = aa + diag[i] - 1;
495     vi   = aj + diag[i] - 1;
496     nz   = diag[i] - ai[i];
497     s1   = t[i];
498     while (nz--) {
499       t[*vi--]   -=  (*v--)*s1;
500     }
501   }
502 
503   /* copy t into x according to permutation */
504   for (i=0; i<n; i++) {
505     x[r[i]]   = t[i];
506   }
507 
508   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
509   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
510   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
512   PetscLogFlops(2*(a->nz) - A->n);
513   PetscFunctionReturn(0);
514 }
515 
516 #undef __FUNCT__
517 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
518 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519 {
520   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
521   IS              iscol=a->col,isrow=a->row;
522   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523   int             *diag = a->diag,ii,ic,ir,oidx;
524   MatScalar       *aa=a->a,*v;
525   PetscScalar     s1,s2,x1,x2;
526   PetscScalar     *x,*b,*t;
527 
528   PetscFunctionBegin;
529   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
530   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
531   t  = a->solve_work;
532 
533   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
534   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
535 
536   /* copy the b into temp work space according to permutation */
537   ii = 0;
538   for (i=0; i<n; i++) {
539     ic      = 2*c[i];
540     t[ii]   = b[ic];
541     t[ii+1] = b[ic+1];
542     ii += 2;
543   }
544 
545   /* forward solve the U^T */
546   idx = 0;
547   for (i=0; i<n; i++) {
548 
549     v     = aa + 4*diag[i];
550     /* multiply by the inverse of the block diagonal */
551     x1    = t[idx];   x2 = t[1+idx];
552     s1 = v[0]*x1  +  v[1]*x2;
553     s2 = v[2]*x1  +  v[3]*x2;
554     v += 4;
555 
556     vi    = aj + diag[i] + 1;
557     nz    = ai[i+1] - diag[i] - 1;
558     while (nz--) {
559       oidx = 2*(*vi++);
560       t[oidx]   -= v[0]*s1  +  v[1]*s2;
561       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
562       v  += 4;
563     }
564     t[idx]   = s1;t[1+idx] = s2;
565     idx += 2;
566   }
567   /* backward solve the L^T */
568   for (i=n-1; i>=0; i--){
569     v    = aa + 4*diag[i] - 4;
570     vi   = aj + diag[i] - 1;
571     nz   = diag[i] - ai[i];
572     idt  = 2*i;
573     s1 = t[idt];  s2 = t[1+idt];
574     while (nz--) {
575       idx   = 2*(*vi--);
576       t[idx]   -=  v[0]*s1 +  v[1]*s2;
577       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
578       v -= 4;
579     }
580   }
581 
582   /* copy t into x according to permutation */
583   ii = 0;
584   for (i=0; i<n; i++) {
585     ir      = 2*r[i];
586     x[ir]   = t[ii];
587     x[ir+1] = t[ii+1];
588     ii += 2;
589   }
590 
591   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
592   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
595   PetscLogFlops(2*4*(a->nz) - 2*A->n);
596   PetscFunctionReturn(0);
597 }
598 
599 #undef __FUNCT__
600 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
601 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602 {
603   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
604   IS              iscol=a->col,isrow=a->row;
605   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606   int             *diag = a->diag,ii,ic,ir,oidx;
607   MatScalar       *aa=a->a,*v;
608   PetscScalar     s1,s2,s3,x1,x2,x3;
609   PetscScalar     *x,*b,*t;
610 
611   PetscFunctionBegin;
612   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
613   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
614   t  = a->solve_work;
615 
616   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
617   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
618 
619   /* copy the b into temp work space according to permutation */
620   ii = 0;
621   for (i=0; i<n; i++) {
622     ic      = 3*c[i];
623     t[ii]   = b[ic];
624     t[ii+1] = b[ic+1];
625     t[ii+2] = b[ic+2];
626     ii += 3;
627   }
628 
629   /* forward solve the U^T */
630   idx = 0;
631   for (i=0; i<n; i++) {
632 
633     v     = aa + 9*diag[i];
634     /* multiply by the inverse of the block diagonal */
635     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
636     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
637     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
638     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
639     v += 9;
640 
641     vi    = aj + diag[i] + 1;
642     nz    = ai[i+1] - diag[i] - 1;
643     while (nz--) {
644       oidx = 3*(*vi++);
645       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
646       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
647       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648       v  += 9;
649     }
650     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
651     idx += 3;
652   }
653   /* backward solve the L^T */
654   for (i=n-1; i>=0; i--){
655     v    = aa + 9*diag[i] - 9;
656     vi   = aj + diag[i] - 1;
657     nz   = diag[i] - ai[i];
658     idt  = 3*i;
659     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
660     while (nz--) {
661       idx   = 3*(*vi--);
662       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
663       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
664       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665       v -= 9;
666     }
667   }
668 
669   /* copy t into x according to permutation */
670   ii = 0;
671   for (i=0; i<n; i++) {
672     ir      = 3*r[i];
673     x[ir]   = t[ii];
674     x[ir+1] = t[ii+1];
675     x[ir+2] = t[ii+2];
676     ii += 3;
677   }
678 
679   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
680   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
681   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
682   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
683   PetscLogFlops(2*9*(a->nz) - 3*A->n);
684   PetscFunctionReturn(0);
685 }
686 
687 #undef __FUNCT__
688 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
689 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690 {
691   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
692   IS              iscol=a->col,isrow=a->row;
693   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694   int             *diag = a->diag,ii,ic,ir,oidx;
695   MatScalar       *aa=a->a,*v;
696   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
697   PetscScalar     *x,*b,*t;
698 
699   PetscFunctionBegin;
700   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
701   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
702   t  = a->solve_work;
703 
704   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
705   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
706 
707   /* copy the b into temp work space according to permutation */
708   ii = 0;
709   for (i=0; i<n; i++) {
710     ic      = 4*c[i];
711     t[ii]   = b[ic];
712     t[ii+1] = b[ic+1];
713     t[ii+2] = b[ic+2];
714     t[ii+3] = b[ic+3];
715     ii += 4;
716   }
717 
718   /* forward solve the U^T */
719   idx = 0;
720   for (i=0; i<n; i++) {
721 
722     v     = aa + 16*diag[i];
723     /* multiply by the inverse of the block diagonal */
724     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
725     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
726     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
727     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
728     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729     v += 16;
730 
731     vi    = aj + diag[i] + 1;
732     nz    = ai[i+1] - diag[i] - 1;
733     while (nz--) {
734       oidx = 4*(*vi++);
735       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
736       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
737       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739       v  += 16;
740     }
741     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742     idx += 4;
743   }
744   /* backward solve the L^T */
745   for (i=n-1; i>=0; i--){
746     v    = aa + 16*diag[i] - 16;
747     vi   = aj + diag[i] - 1;
748     nz   = diag[i] - ai[i];
749     idt  = 4*i;
750     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751     while (nz--) {
752       idx   = 4*(*vi--);
753       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
754       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
755       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757       v -= 16;
758     }
759   }
760 
761   /* copy t into x according to permutation */
762   ii = 0;
763   for (i=0; i<n; i++) {
764     ir      = 4*r[i];
765     x[ir]   = t[ii];
766     x[ir+1] = t[ii+1];
767     x[ir+2] = t[ii+2];
768     x[ir+3] = t[ii+3];
769     ii += 4;
770   }
771 
772   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
773   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
774   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
775   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
776   PetscLogFlops(2*16*(a->nz) - 4*A->n);
777   PetscFunctionReturn(0);
778 }
779 
780 #undef __FUNCT__
781 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
782 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783 {
784   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
785   IS              iscol=a->col,isrow=a->row;
786   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787   int             *diag = a->diag,ii,ic,ir,oidx;
788   MatScalar       *aa=a->a,*v;
789   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
790   PetscScalar     *x,*b,*t;
791 
792   PetscFunctionBegin;
793   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
794   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795   t  = a->solve_work;
796 
797   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
799 
800   /* copy the b into temp work space according to permutation */
801   ii = 0;
802   for (i=0; i<n; i++) {
803     ic      = 5*c[i];
804     t[ii]   = b[ic];
805     t[ii+1] = b[ic+1];
806     t[ii+2] = b[ic+2];
807     t[ii+3] = b[ic+3];
808     t[ii+4] = b[ic+4];
809     ii += 5;
810   }
811 
812   /* forward solve the U^T */
813   idx = 0;
814   for (i=0; i<n; i++) {
815 
816     v     = aa + 25*diag[i];
817     /* multiply by the inverse of the block diagonal */
818     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
820     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
821     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824     v += 25;
825 
826     vi    = aj + diag[i] + 1;
827     nz    = ai[i+1] - diag[i] - 1;
828     while (nz--) {
829       oidx = 5*(*vi++);
830       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
831       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
832       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835       v  += 25;
836     }
837     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838     idx += 5;
839   }
840   /* backward solve the L^T */
841   for (i=n-1; i>=0; i--){
842     v    = aa + 25*diag[i] - 25;
843     vi   = aj + diag[i] - 1;
844     nz   = diag[i] - ai[i];
845     idt  = 5*i;
846     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847     while (nz--) {
848       idx   = 5*(*vi--);
849       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v -= 25;
855     }
856   }
857 
858   /* copy t into x according to permutation */
859   ii = 0;
860   for (i=0; i<n; i++) {
861     ir      = 5*r[i];
862     x[ir]   = t[ii];
863     x[ir+1] = t[ii+1];
864     x[ir+2] = t[ii+2];
865     x[ir+3] = t[ii+3];
866     x[ir+4] = t[ii+4];
867     ii += 5;
868   }
869 
870   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
871   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
872   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
873   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
874   PetscLogFlops(2*25*(a->nz) - 5*A->n);
875   PetscFunctionReturn(0);
876 }
877 
878 #undef __FUNCT__
879 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
880 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881 {
882   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
883   IS              iscol=a->col,isrow=a->row;
884   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885   int             *diag = a->diag,ii,ic,ir,oidx;
886   MatScalar       *aa=a->a,*v;
887   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
888   PetscScalar     *x,*b,*t;
889 
890   PetscFunctionBegin;
891   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
893   t  = a->solve_work;
894 
895   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
896   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
897 
898   /* copy the b into temp work space according to permutation */
899   ii = 0;
900   for (i=0; i<n; i++) {
901     ic      = 6*c[i];
902     t[ii]   = b[ic];
903     t[ii+1] = b[ic+1];
904     t[ii+2] = b[ic+2];
905     t[ii+3] = b[ic+3];
906     t[ii+4] = b[ic+4];
907     t[ii+5] = b[ic+5];
908     ii += 6;
909   }
910 
911   /* forward solve the U^T */
912   idx = 0;
913   for (i=0; i<n; i++) {
914 
915     v     = aa + 36*diag[i];
916     /* multiply by the inverse of the block diagonal */
917     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918     x6    = t[5+idx];
919     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
920     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
921     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925     v += 36;
926 
927     vi    = aj + diag[i] + 1;
928     nz    = ai[i+1] - diag[i] - 1;
929     while (nz--) {
930       oidx = 6*(*vi++);
931       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
932       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
933       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937       v  += 36;
938     }
939     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940     t[5+idx] = s6;
941     idx += 6;
942   }
943   /* backward solve the L^T */
944   for (i=n-1; i>=0; i--){
945     v    = aa + 36*diag[i] - 36;
946     vi   = aj + diag[i] - 1;
947     nz   = diag[i] - ai[i];
948     idt  = 6*i;
949     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950     s6 = t[5+idt];
951     while (nz--) {
952       idx   = 6*(*vi--);
953       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
954       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
955       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959       v -= 36;
960     }
961   }
962 
963   /* copy t into x according to permutation */
964   ii = 0;
965   for (i=0; i<n; i++) {
966     ir      = 6*r[i];
967     x[ir]   = t[ii];
968     x[ir+1] = t[ii+1];
969     x[ir+2] = t[ii+2];
970     x[ir+3] = t[ii+3];
971     x[ir+4] = t[ii+4];
972     x[ir+5] = t[ii+5];
973     ii += 6;
974   }
975 
976   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
977   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
978   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
979   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
980   PetscLogFlops(2*36*(a->nz) - 6*A->n);
981   PetscFunctionReturn(0);
982 }
983 
984 #undef __FUNCT__
985 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
986 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987 {
988   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
989   IS              iscol=a->col,isrow=a->row;
990   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991   int             *diag = a->diag,ii,ic,ir,oidx;
992   MatScalar       *aa=a->a,*v;
993   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
994   PetscScalar     *x,*b,*t;
995 
996   PetscFunctionBegin;
997   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
998   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
999   t  = a->solve_work;
1000 
1001   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1002   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1003 
1004   /* copy the b into temp work space according to permutation */
1005   ii = 0;
1006   for (i=0; i<n; i++) {
1007     ic      = 7*c[i];
1008     t[ii]   = b[ic];
1009     t[ii+1] = b[ic+1];
1010     t[ii+2] = b[ic+2];
1011     t[ii+3] = b[ic+3];
1012     t[ii+4] = b[ic+4];
1013     t[ii+5] = b[ic+5];
1014     t[ii+6] = b[ic+6];
1015     ii += 7;
1016   }
1017 
1018   /* forward solve the U^T */
1019   idx = 0;
1020   for (i=0; i<n; i++) {
1021 
1022     v     = aa + 49*diag[i];
1023     /* multiply by the inverse of the block diagonal */
1024     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025     x6    = t[5+idx]; x7 = t[6+idx];
1026     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1027     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033     v += 49;
1034 
1035     vi    = aj + diag[i] + 1;
1036     nz    = ai[i+1] - diag[i] - 1;
1037     while (nz--) {
1038       oidx = 7*(*vi++);
1039       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1040       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046       v  += 49;
1047     }
1048     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049     t[5+idx] = s6;t[6+idx] = s7;
1050     idx += 7;
1051   }
1052   /* backward solve the L^T */
1053   for (i=n-1; i>=0; i--){
1054     v    = aa + 49*diag[i] - 49;
1055     vi   = aj + diag[i] - 1;
1056     nz   = diag[i] - ai[i];
1057     idt  = 7*i;
1058     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059     s6 = t[5+idt];s7 = t[6+idt];
1060     while (nz--) {
1061       idx   = 7*(*vi--);
1062       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v -= 49;
1070     }
1071   }
1072 
1073   /* copy t into x according to permutation */
1074   ii = 0;
1075   for (i=0; i<n; i++) {
1076     ir      = 7*r[i];
1077     x[ir]   = t[ii];
1078     x[ir+1] = t[ii+1];
1079     x[ir+2] = t[ii+2];
1080     x[ir+3] = t[ii+3];
1081     x[ir+4] = t[ii+4];
1082     x[ir+5] = t[ii+5];
1083     x[ir+6] = t[ii+6];
1084     ii += 7;
1085   }
1086 
1087   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 /* ----------------------------------------------------------- */
1096 #undef __FUNCT__
1097 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1098 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1099 {
1100   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1101   IS              iscol=a->col,isrow=a->row;
1102   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1103   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1104   MatScalar       *aa=a->a,*v;
1105   PetscScalar     *x,*b,*s,*t,*ls;
1106 
1107   PetscFunctionBegin;
1108   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1109   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1110   t  = a->solve_work;
1111 
1112   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1113   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1114 
1115   /* forward solve the lower triangular */
1116   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1117   for (i=1; i<n; i++) {
1118     v   = aa + bs2*ai[i];
1119     vi  = aj + ai[i];
1120     nz  = a->diag[i] - ai[i];
1121     s = t + bs*i;
1122     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1123     while (nz--) {
1124       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1125       v += bs2;
1126     }
1127   }
1128   /* backward solve the upper triangular */
1129   ls = a->solve_work + A->n;
1130   for (i=n-1; i>=0; i--){
1131     v   = aa + bs2*(a->diag[i] + 1);
1132     vi  = aj + a->diag[i] + 1;
1133     nz  = ai[i+1] - a->diag[i] - 1;
1134     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1135     while (nz--) {
1136       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1137       v += bs2;
1138     }
1139     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1140     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1141   }
1142 
1143   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1144   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1145   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1146   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1147   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1148   PetscFunctionReturn(0);
1149 }
1150 
1151 #undef __FUNCT__
1152 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1153 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1154 {
1155   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1156   IS              iscol=a->col,isrow=a->row;
1157   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1158   int             *diag = a->diag;
1159   MatScalar       *aa=a->a,*v;
1160   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1161   PetscScalar     *x,*b,*t;
1162 
1163   PetscFunctionBegin;
1164   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1165   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1166   t  = a->solve_work;
1167 
1168   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1169   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1170 
1171   /* forward solve the lower triangular */
1172   idx    = 7*(*r++);
1173   t[0] = b[idx];   t[1] = b[1+idx];
1174   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175   t[5] = b[5+idx]; t[6] = b[6+idx];
1176 
1177   for (i=1; i<n; i++) {
1178     v     = aa + 49*ai[i];
1179     vi    = aj + ai[i];
1180     nz    = diag[i] - ai[i];
1181     idx   = 7*(*r++);
1182     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1184     while (nz--) {
1185       idx   = 7*(*vi++);
1186       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1187       x4    = t[3+idx];x5 = t[4+idx];
1188       x6    = t[5+idx];x7 = t[6+idx];
1189       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1196       v += 49;
1197     }
1198     idx = 7*i;
1199     t[idx]   = s1;t[1+idx] = s2;
1200     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201     t[5+idx] = s6;t[6+idx] = s7;
1202   }
1203   /* backward solve the upper triangular */
1204   for (i=n-1; i>=0; i--){
1205     v    = aa + 49*diag[i] + 49;
1206     vi   = aj + diag[i] + 1;
1207     nz   = ai[i+1] - diag[i] - 1;
1208     idt  = 7*i;
1209     s1 = t[idt];  s2 = t[1+idt];
1210     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211     s6 = t[5+idt];s7 = t[6+idt];
1212     while (nz--) {
1213       idx   = 7*(*vi++);
1214       x1    = t[idx];   x2 = t[1+idx];
1215       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216       x6    = t[5+idx]; x7 = t[6+idx];
1217       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1224       v += 49;
1225     }
1226     idc = 7*(*c--);
1227     v   = aa + 49*diag[i];
1228     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1229                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1242   }
1243 
1244   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1245   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1246   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1247   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1248   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1249   PetscFunctionReturn(0);
1250 }
1251 
1252 #undef __FUNCT__
1253 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1254 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1255 {
1256   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1257   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1258   int             ierr,*diag = a->diag,jdx;
1259   MatScalar       *aa=a->a,*v;
1260   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1261 
1262   PetscFunctionBegin;
1263   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1265   /* forward solve the lower triangular */
1266   idx    = 0;
1267   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1268   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1269   x[6] = b[6+idx];
1270   for (i=1; i<n; i++) {
1271     v     =  aa + 49*ai[i];
1272     vi    =  aj + ai[i];
1273     nz    =  diag[i] - ai[i];
1274     idx   =  7*i;
1275     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1276     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277     s7  =  b[6+idx];
1278     while (nz--) {
1279       jdx   = 7*(*vi++);
1280       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1281       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1282       x7    = x[6+jdx];
1283       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1290       v += 49;
1291      }
1292     x[idx]   = s1;
1293     x[1+idx] = s2;
1294     x[2+idx] = s3;
1295     x[3+idx] = s4;
1296     x[4+idx] = s5;
1297     x[5+idx] = s6;
1298     x[6+idx] = s7;
1299   }
1300   /* backward solve the upper triangular */
1301   for (i=n-1; i>=0; i--){
1302     v    = aa + 49*diag[i] + 49;
1303     vi   = aj + diag[i] + 1;
1304     nz   = ai[i+1] - diag[i] - 1;
1305     idt  = 7*i;
1306     s1 = x[idt];   s2 = x[1+idt];
1307     s3 = x[2+idt]; s4 = x[3+idt];
1308     s5 = x[4+idt]; s6 = x[5+idt];
1309     s7 = x[6+idt];
1310     while (nz--) {
1311       idx   = 7*(*vi++);
1312       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1313       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1314       x7    = x[6+idx];
1315       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1322       v += 49;
1323     }
1324     v        = aa + 49*diag[i];
1325     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1326                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1328                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1330                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1332                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1334                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1336                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1338                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1339   }
1340 
1341   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1342   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1343   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 #undef __FUNCT__
1348 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1349 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1350 {
1351   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1352   IS              iscol=a->col,isrow=a->row;
1353   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1354   int             *diag = a->diag;
1355   MatScalar       *aa=a->a,*v;
1356   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1357 
1358   PetscFunctionBegin;
1359   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1360   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1361   t  = a->solve_work;
1362 
1363   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1364   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1365 
1366   /* forward solve the lower triangular */
1367   idx    = 6*(*r++);
1368   t[0] = b[idx];   t[1] = b[1+idx];
1369   t[2] = b[2+idx]; t[3] = b[3+idx];
1370   t[4] = b[4+idx]; t[5] = b[5+idx];
1371   for (i=1; i<n; i++) {
1372     v     = aa + 36*ai[i];
1373     vi    = aj + ai[i];
1374     nz    = diag[i] - ai[i];
1375     idx   = 6*(*r++);
1376     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377     s5  = b[4+idx]; s6 = b[5+idx];
1378     while (nz--) {
1379       idx   = 6*(*vi++);
1380       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1381       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1388       v += 36;
1389     }
1390     idx = 6*i;
1391     t[idx]   = s1;t[1+idx] = s2;
1392     t[2+idx] = s3;t[3+idx] = s4;
1393     t[4+idx] = s5;t[5+idx] = s6;
1394   }
1395   /* backward solve the upper triangular */
1396   for (i=n-1; i>=0; i--){
1397     v    = aa + 36*diag[i] + 36;
1398     vi   = aj + diag[i] + 1;
1399     nz   = ai[i+1] - diag[i] - 1;
1400     idt  = 6*i;
1401     s1 = t[idt];  s2 = t[1+idt];
1402     s3 = t[2+idt];s4 = t[3+idt];
1403     s5 = t[4+idt];s6 = t[5+idt];
1404     while (nz--) {
1405       idx   = 6*(*vi++);
1406       x1    = t[idx];   x2 = t[1+idx];
1407       x3    = t[2+idx]; x4 = t[3+idx];
1408       x5    = t[4+idx]; x6 = t[5+idx];
1409       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415       v += 36;
1416     }
1417     idc = 6*(*c--);
1418     v   = aa + 36*diag[i];
1419     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1420                                  v[18]*s4+v[24]*s5+v[30]*s6;
1421     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422                                  v[19]*s4+v[25]*s5+v[31]*s6;
1423     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424                                  v[20]*s4+v[26]*s5+v[32]*s6;
1425     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426                                  v[21]*s4+v[27]*s5+v[33]*s6;
1427     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428                                  v[22]*s4+v[28]*s5+v[34]*s6;
1429     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430                                  v[23]*s4+v[29]*s5+v[35]*s6;
1431   }
1432 
1433   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1434   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1435   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1436   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1437   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 #undef __FUNCT__
1442 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1443 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1444 {
1445   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1446   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1447   int             ierr,*diag = a->diag,jdx;
1448   MatScalar       *aa=a->a,*v;
1449   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1450 
1451   PetscFunctionBegin;
1452   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1453   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1454   /* forward solve the lower triangular */
1455   idx    = 0;
1456   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1457   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1458   for (i=1; i<n; i++) {
1459     v     =  aa + 36*ai[i];
1460     vi    =  aj + ai[i];
1461     nz    =  diag[i] - ai[i];
1462     idx   =  6*i;
1463     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1464     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1465     while (nz--) {
1466       jdx   = 6*(*vi++);
1467       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1468       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1475       v += 36;
1476      }
1477     x[idx]   = s1;
1478     x[1+idx] = s2;
1479     x[2+idx] = s3;
1480     x[3+idx] = s4;
1481     x[4+idx] = s5;
1482     x[5+idx] = s6;
1483   }
1484   /* backward solve the upper triangular */
1485   for (i=n-1; i>=0; i--){
1486     v    = aa + 36*diag[i] + 36;
1487     vi   = aj + diag[i] + 1;
1488     nz   = ai[i+1] - diag[i] - 1;
1489     idt  = 6*i;
1490     s1 = x[idt];   s2 = x[1+idt];
1491     s3 = x[2+idt]; s4 = x[3+idt];
1492     s5 = x[4+idt]; s6 = x[5+idt];
1493     while (nz--) {
1494       idx   = 6*(*vi++);
1495       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1496       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1503       v += 36;
1504     }
1505     v        = aa + 36*diag[i];
1506     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1512   }
1513 
1514   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1516   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 #undef __FUNCT__
1521 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1522 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1523 {
1524   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1525   IS              iscol=a->col,isrow=a->row;
1526   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1527   int             *diag = a->diag;
1528   MatScalar       *aa=a->a,*v;
1529   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1530 
1531   PetscFunctionBegin;
1532   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534   t  = a->solve_work;
1535 
1536   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1537   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1538 
1539   /* forward solve the lower triangular */
1540   idx    = 5*(*r++);
1541   t[0] = b[idx];   t[1] = b[1+idx];
1542   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1543   for (i=1; i<n; i++) {
1544     v     = aa + 25*ai[i];
1545     vi    = aj + ai[i];
1546     nz    = diag[i] - ai[i];
1547     idx   = 5*(*r++);
1548     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549     s5  = b[4+idx];
1550     while (nz--) {
1551       idx   = 5*(*vi++);
1552       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1553       x4    = t[3+idx];x5 = t[4+idx];
1554       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1559       v += 25;
1560     }
1561     idx = 5*i;
1562     t[idx]   = s1;t[1+idx] = s2;
1563     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1564   }
1565   /* backward solve the upper triangular */
1566   for (i=n-1; i>=0; i--){
1567     v    = aa + 25*diag[i] + 25;
1568     vi   = aj + diag[i] + 1;
1569     nz   = ai[i+1] - diag[i] - 1;
1570     idt  = 5*i;
1571     s1 = t[idt];  s2 = t[1+idt];
1572     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1573     while (nz--) {
1574       idx   = 5*(*vi++);
1575       x1    = t[idx];   x2 = t[1+idx];
1576       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582       v += 25;
1583     }
1584     idc = 5*(*c--);
1585     v   = aa + 25*diag[i];
1586     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1587                                  v[15]*s4+v[20]*s5;
1588     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589                                  v[16]*s4+v[21]*s5;
1590     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591                                  v[17]*s4+v[22]*s5;
1592     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593                                  v[18]*s4+v[23]*s5;
1594     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595                                  v[19]*s4+v[24]*s5;
1596   }
1597 
1598   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1599   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1600   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1601   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1602   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1603   PetscFunctionReturn(0);
1604 }
1605 
1606 #undef __FUNCT__
1607 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1608 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1609 {
1610   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1611   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1612   int             ierr,*diag = a->diag,jdx;
1613   MatScalar       *aa=a->a,*v;
1614   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1615 
1616   PetscFunctionBegin;
1617   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1618   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1619   /* forward solve the lower triangular */
1620   idx    = 0;
1621   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1622   for (i=1; i<n; i++) {
1623     v     =  aa + 25*ai[i];
1624     vi    =  aj + ai[i];
1625     nz    =  diag[i] - ai[i];
1626     idx   =  5*i;
1627     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1628     while (nz--) {
1629       jdx   = 5*(*vi++);
1630       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1632       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1633       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1634       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1635       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1636       v    += 25;
1637     }
1638     x[idx]   = s1;
1639     x[1+idx] = s2;
1640     x[2+idx] = s3;
1641     x[3+idx] = s4;
1642     x[4+idx] = s5;
1643   }
1644   /* backward solve the upper triangular */
1645   for (i=n-1; i>=0; i--){
1646     v    = aa + 25*diag[i] + 25;
1647     vi   = aj + diag[i] + 1;
1648     nz   = ai[i+1] - diag[i] - 1;
1649     idt  = 5*i;
1650     s1 = x[idt];  s2 = x[1+idt];
1651     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1652     while (nz--) {
1653       idx   = 5*(*vi++);
1654       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1656       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1657       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1658       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1659       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1660       v    += 25;
1661     }
1662     v        = aa + 25*diag[i];
1663     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1664     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1665     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1666     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1667     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1668   }
1669 
1670   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1671   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1672   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1673   PetscFunctionReturn(0);
1674 }
1675 
1676 #undef __FUNCT__
1677 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1678 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1679 {
1680   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1681   IS              iscol=a->col,isrow=a->row;
1682   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1683   int             *diag = a->diag;
1684   MatScalar       *aa=a->a,*v;
1685   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1686 
1687   PetscFunctionBegin;
1688   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1689   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1690   t  = a->solve_work;
1691 
1692   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1693   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1694 
1695   /* forward solve the lower triangular */
1696   idx    = 4*(*r++);
1697   t[0] = b[idx];   t[1] = b[1+idx];
1698   t[2] = b[2+idx]; t[3] = b[3+idx];
1699   for (i=1; i<n; i++) {
1700     v     = aa + 16*ai[i];
1701     vi    = aj + ai[i];
1702     nz    = diag[i] - ai[i];
1703     idx   = 4*(*r++);
1704     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1705     while (nz--) {
1706       idx   = 4*(*vi++);
1707       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1709       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1710       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1712       v    += 16;
1713     }
1714     idx        = 4*i;
1715     t[idx]   = s1;t[1+idx] = s2;
1716     t[2+idx] = s3;t[3+idx] = s4;
1717   }
1718   /* backward solve the upper triangular */
1719   for (i=n-1; i>=0; i--){
1720     v    = aa + 16*diag[i] + 16;
1721     vi   = aj + diag[i] + 1;
1722     nz   = ai[i+1] - diag[i] - 1;
1723     idt  = 4*i;
1724     s1 = t[idt];  s2 = t[1+idt];
1725     s3 = t[2+idt];s4 = t[3+idt];
1726     while (nz--) {
1727       idx   = 4*(*vi++);
1728       x1    = t[idx];   x2 = t[1+idx];
1729       x3    = t[2+idx]; x4 = t[3+idx];
1730       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1731       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1732       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1733       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1734       v += 16;
1735     }
1736     idc      = 4*(*c--);
1737     v        = aa + 16*diag[i];
1738     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1742   }
1743 
1744   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1745   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1746   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1747   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1748   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1749   PetscFunctionReturn(0);
1750 }
1751 
1752 #undef __FUNCT__
1753 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1754 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1755 {
1756   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1757   IS              iscol=a->col,isrow=a->row;
1758   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1759   int             *diag = a->diag;
1760   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1761   PetscScalar     *x,*b;
1762 
1763   PetscFunctionBegin;
1764   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1765   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1766   t  = (MatScalar *)a->solve_work;
1767 
1768   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1769   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1770 
1771   /* forward solve the lower triangular */
1772   idx    = 4*(*r++);
1773   t[0] = (MatScalar)b[idx];
1774   t[1] = (MatScalar)b[1+idx];
1775   t[2] = (MatScalar)b[2+idx];
1776   t[3] = (MatScalar)b[3+idx];
1777   for (i=1; i<n; i++) {
1778     v     = aa + 16*ai[i];
1779     vi    = aj + ai[i];
1780     nz    = diag[i] - ai[i];
1781     idx   = 4*(*r++);
1782     s1 = (MatScalar)b[idx];
1783     s2 = (MatScalar)b[1+idx];
1784     s3 = (MatScalar)b[2+idx];
1785     s4 = (MatScalar)b[3+idx];
1786     while (nz--) {
1787       idx   = 4*(*vi++);
1788       x1  = t[idx];
1789       x2  = t[1+idx];
1790       x3  = t[2+idx];
1791       x4  = t[3+idx];
1792       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1793       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1794       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1795       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1796       v    += 16;
1797     }
1798     idx        = 4*i;
1799     t[idx]   = s1;
1800     t[1+idx] = s2;
1801     t[2+idx] = s3;
1802     t[3+idx] = s4;
1803   }
1804   /* backward solve the upper triangular */
1805   for (i=n-1; i>=0; i--){
1806     v    = aa + 16*diag[i] + 16;
1807     vi   = aj + diag[i] + 1;
1808     nz   = ai[i+1] - diag[i] - 1;
1809     idt  = 4*i;
1810     s1 = t[idt];
1811     s2 = t[1+idt];
1812     s3 = t[2+idt];
1813     s4 = t[3+idt];
1814     while (nz--) {
1815       idx   = 4*(*vi++);
1816       x1  = t[idx];
1817       x2  = t[1+idx];
1818       x3  = t[2+idx];
1819       x4  = t[3+idx];
1820       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1821       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1822       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1823       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1824       v += 16;
1825     }
1826     idc      = 4*(*c--);
1827     v        = aa + 16*diag[i];
1828     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1829     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1830     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1831     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1832     x[idc]   = (PetscScalar)t[idt];
1833     x[1+idc] = (PetscScalar)t[1+idt];
1834     x[2+idc] = (PetscScalar)t[2+idt];
1835     x[3+idc] = (PetscScalar)t[3+idt];
1836  }
1837 
1838   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1839   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1840   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1841   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1842   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1843   PetscFunctionReturn(0);
1844 }
1845 
1846 #if defined (PETSC_HAVE_SSE)
1847 
1848 #include PETSC_HAVE_SSE
1849 
1850 #undef __FUNCT__
1851 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1852 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1853 {
1854   /*
1855      Note: This code uses demotion of double
1856      to float when performing the mixed-mode computation.
1857      This may not be numerically reasonable for all applications.
1858   */
1859   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1860   IS              iscol=a->col,isrow=a->row;
1861   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1862   int             *diag = a->diag,ai16;
1863   MatScalar       *aa=a->a,*v;
1864   PetscScalar     *x,*b,*t;
1865 
1866   /* Make space in temp stack for 16 Byte Aligned arrays */
1867   float           ssealignedspace[11],*tmps,*tmpx;
1868   unsigned long   offset;
1869 
1870   PetscFunctionBegin;
1871   SSE_SCOPE_BEGIN;
1872 
1873     offset = (unsigned long)ssealignedspace % 16;
1874     if (offset) offset = (16 - offset)/4;
1875     tmps = &ssealignedspace[offset];
1876     tmpx = &ssealignedspace[offset+4];
1877     PREFETCH_NTA(aa+16*ai[1]);
1878 
1879     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1880     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1881     t  = a->solve_work;
1882 
1883     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1884     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1885 
1886     /* forward solve the lower triangular */
1887     idx  = 4*(*r++);
1888     t[0] = b[idx];   t[1] = b[1+idx];
1889     t[2] = b[2+idx]; t[3] = b[3+idx];
1890     v    =  aa + 16*ai[1];
1891 
1892     for (i=1; i<n;) {
1893       PREFETCH_NTA(&v[8]);
1894       vi   =  aj      + ai[i];
1895       nz   =  diag[i] - ai[i];
1896       idx  =  4*(*r++);
1897 
1898       /* Demote sum from double to float */
1899       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1900       LOAD_PS(tmps,XMM7);
1901 
1902       while (nz--) {
1903         PREFETCH_NTA(&v[16]);
1904         idx = 4*(*vi++);
1905 
1906         /* Demote solution (so far) from double to float */
1907         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1908 
1909         /* 4x4 Matrix-Vector product with negative accumulation: */
1910         SSE_INLINE_BEGIN_2(tmpx,v)
1911           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1912 
1913           /* First Column */
1914           SSE_COPY_PS(XMM0,XMM6)
1915           SSE_SHUFFLE(XMM0,XMM0,0x00)
1916           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1917           SSE_SUB_PS(XMM7,XMM0)
1918 
1919           /* Second Column */
1920           SSE_COPY_PS(XMM1,XMM6)
1921           SSE_SHUFFLE(XMM1,XMM1,0x55)
1922           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1923           SSE_SUB_PS(XMM7,XMM1)
1924 
1925           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1926 
1927           /* Third Column */
1928           SSE_COPY_PS(XMM2,XMM6)
1929           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1930           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1931           SSE_SUB_PS(XMM7,XMM2)
1932 
1933           /* Fourth Column */
1934           SSE_COPY_PS(XMM3,XMM6)
1935           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1936           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1937           SSE_SUB_PS(XMM7,XMM3)
1938         SSE_INLINE_END_2
1939 
1940         v  += 16;
1941       }
1942       idx = 4*i;
1943       v   = aa + 16*ai[++i];
1944       PREFETCH_NTA(v);
1945       STORE_PS(tmps,XMM7);
1946 
1947       /* Promote result from float to double */
1948       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1949     }
1950     /* backward solve the upper triangular */
1951     idt  = 4*(n-1);
1952     ai16 = 16*diag[n-1];
1953     v    = aa + ai16 + 16;
1954     for (i=n-1; i>=0;){
1955       PREFETCH_NTA(&v[8]);
1956       vi = aj + diag[i] + 1;
1957       nz = ai[i+1] - diag[i] - 1;
1958 
1959       /* Demote accumulator from double to float */
1960       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1961       LOAD_PS(tmps,XMM7);
1962 
1963       while (nz--) {
1964         PREFETCH_NTA(&v[16]);
1965         idx = 4*(*vi++);
1966 
1967         /* Demote solution (so far) from double to float */
1968         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1969 
1970         /* 4x4 Matrix-Vector Product with negative accumulation: */
1971         SSE_INLINE_BEGIN_2(tmpx,v)
1972           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1973 
1974           /* First Column */
1975           SSE_COPY_PS(XMM0,XMM6)
1976           SSE_SHUFFLE(XMM0,XMM0,0x00)
1977           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1978           SSE_SUB_PS(XMM7,XMM0)
1979 
1980           /* Second Column */
1981           SSE_COPY_PS(XMM1,XMM6)
1982           SSE_SHUFFLE(XMM1,XMM1,0x55)
1983           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1984           SSE_SUB_PS(XMM7,XMM1)
1985 
1986           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1987 
1988           /* Third Column */
1989           SSE_COPY_PS(XMM2,XMM6)
1990           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1991           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1992           SSE_SUB_PS(XMM7,XMM2)
1993 
1994           /* Fourth Column */
1995           SSE_COPY_PS(XMM3,XMM6)
1996           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1997           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1998           SSE_SUB_PS(XMM7,XMM3)
1999         SSE_INLINE_END_2
2000         v  += 16;
2001       }
2002       v    = aa + ai16;
2003       ai16 = 16*diag[--i];
2004       PREFETCH_NTA(aa+ai16+16);
2005       /*
2006          Scale the result by the diagonal 4x4 block,
2007          which was inverted as part of the factorization
2008       */
2009       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2010         /* First Column */
2011         SSE_COPY_PS(XMM0,XMM7)
2012         SSE_SHUFFLE(XMM0,XMM0,0x00)
2013         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2014 
2015         /* Second Column */
2016         SSE_COPY_PS(XMM1,XMM7)
2017         SSE_SHUFFLE(XMM1,XMM1,0x55)
2018         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2019         SSE_ADD_PS(XMM0,XMM1)
2020 
2021         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2022 
2023         /* Third Column */
2024         SSE_COPY_PS(XMM2,XMM7)
2025         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2026         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2027         SSE_ADD_PS(XMM0,XMM2)
2028 
2029         /* Fourth Column */
2030         SSE_COPY_PS(XMM3,XMM7)
2031         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2032         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2033         SSE_ADD_PS(XMM0,XMM3)
2034 
2035         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2036       SSE_INLINE_END_3
2037 
2038       /* Promote solution from float to double */
2039       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2040 
2041       /* Apply reordering to t and stream into x.    */
2042       /* This way, x doesn't pollute the cache.      */
2043       /* Be careful with size: 2 doubles = 4 floats! */
2044       idc  = 4*(*c--);
2045       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2046         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2047         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2048         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2049         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2050         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2051         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2052       SSE_INLINE_END_2
2053       v    = aa + ai16 + 16;
2054       idt -= 4;
2055     }
2056 
2057     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2058     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2059     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2060     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2061     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2062   SSE_SCOPE_END;
2063   PetscFunctionReturn(0);
2064 }
2065 
2066 #endif
2067 
2068 
2069 /*
2070       Special case where the matrix was ILU(0) factored in the natural
2071    ordering. This eliminates the need for the column and row permutation.
2072 */
2073 #undef __FUNCT__
2074 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2075 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2076 {
2077   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2078   int             n=a->mbs,*ai=a->i,*aj=a->j;
2079   int             ierr,*diag = a->diag;
2080   MatScalar       *aa=a->a;
2081   PetscScalar     *x,*b;
2082 
2083   PetscFunctionBegin;
2084   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2085   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2086 
2087 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2088   {
2089     static PetscScalar w[2000]; /* very BAD need to fix */
2090     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2091   }
2092 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2093   {
2094     static PetscScalar w[2000]; /* very BAD need to fix */
2095     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2096   }
2097 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2098   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2099 #else
2100   {
2101     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2102     MatScalar    *v;
2103     int          jdx,idt,idx,nz,*vi,i,ai16;
2104 
2105   /* forward solve the lower triangular */
2106   idx    = 0;
2107   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2108   for (i=1; i<n; i++) {
2109     v     =  aa      + 16*ai[i];
2110     vi    =  aj      + ai[i];
2111     nz    =  diag[i] - ai[i];
2112     idx   +=  4;
2113     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2114     while (nz--) {
2115       jdx   = 4*(*vi++);
2116       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2117       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2118       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2119       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2120       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2121       v    += 16;
2122     }
2123     x[idx]   = s1;
2124     x[1+idx] = s2;
2125     x[2+idx] = s3;
2126     x[3+idx] = s4;
2127   }
2128   /* backward solve the upper triangular */
2129   idt = 4*(n-1);
2130   for (i=n-1; i>=0; i--){
2131     ai16 = 16*diag[i];
2132     v    = aa + ai16 + 16;
2133     vi   = aj + diag[i] + 1;
2134     nz   = ai[i+1] - diag[i] - 1;
2135     s1 = x[idt];  s2 = x[1+idt];
2136     s3 = x[2+idt];s4 = x[3+idt];
2137     while (nz--) {
2138       idx   = 4*(*vi++);
2139       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2140       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2141       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2142       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2143       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2144       v    += 16;
2145     }
2146     v        = aa + ai16;
2147     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2148     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2149     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2150     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2151     idt -= 4;
2152   }
2153   }
2154 #endif
2155 
2156   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2157   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2158   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2159   PetscFunctionReturn(0);
2160 }
2161 
2162 #undef __FUNCT__
2163 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2164 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2165 {
2166   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2167   int             n=a->mbs,*ai=a->i,*aj=a->j;
2168   int             ierr,*diag = a->diag;
2169   MatScalar       *aa=a->a;
2170   PetscScalar     *x,*b;
2171 
2172   PetscFunctionBegin;
2173   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2174   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2175 
2176   {
2177     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2178     MatScalar  *v,*t=(MatScalar *)x;
2179     int        jdx,idt,idx,nz,*vi,i,ai16;
2180 
2181     /* forward solve the lower triangular */
2182     idx  = 0;
2183     t[0] = (MatScalar)b[0];
2184     t[1] = (MatScalar)b[1];
2185     t[2] = (MatScalar)b[2];
2186     t[3] = (MatScalar)b[3];
2187     for (i=1; i<n; i++) {
2188       v     =  aa      + 16*ai[i];
2189       vi    =  aj      + ai[i];
2190       nz    =  diag[i] - ai[i];
2191       idx   +=  4;
2192       s1 = (MatScalar)b[idx];
2193       s2 = (MatScalar)b[1+idx];
2194       s3 = (MatScalar)b[2+idx];
2195       s4 = (MatScalar)b[3+idx];
2196       while (nz--) {
2197         jdx = 4*(*vi++);
2198         x1  = t[jdx];
2199         x2  = t[1+jdx];
2200         x3  = t[2+jdx];
2201         x4  = t[3+jdx];
2202         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2203         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2204         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2205         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2206         v    += 16;
2207       }
2208       t[idx]   = s1;
2209       t[1+idx] = s2;
2210       t[2+idx] = s3;
2211       t[3+idx] = s4;
2212     }
2213     /* backward solve the upper triangular */
2214     idt = 4*(n-1);
2215     for (i=n-1; i>=0; i--){
2216       ai16 = 16*diag[i];
2217       v    = aa + ai16 + 16;
2218       vi   = aj + diag[i] + 1;
2219       nz   = ai[i+1] - diag[i] - 1;
2220       s1   = t[idt];
2221       s2   = t[1+idt];
2222       s3   = t[2+idt];
2223       s4   = t[3+idt];
2224       while (nz--) {
2225         idx = 4*(*vi++);
2226         x1  = (MatScalar)x[idx];
2227         x2  = (MatScalar)x[1+idx];
2228         x3  = (MatScalar)x[2+idx];
2229         x4  = (MatScalar)x[3+idx];
2230         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2231         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2232         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2234         v    += 16;
2235       }
2236       v        = aa + ai16;
2237       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2238       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2239       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2240       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2241       idt -= 4;
2242     }
2243   }
2244 
2245   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2246   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2247   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2248   PetscFunctionReturn(0);
2249 }
2250 
2251 #if defined (PETSC_HAVE_SSE)
2252 
2253 #include PETSC_HAVE_SSE
2254 #include "src/vec/vecimpl.h" /* to allow VecGetArrayFast() */
2255 #undef __FUNCT__
2256 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2257 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2258 {
2259   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2260   unsigned short *aj=(unsigned short *)a->j;
2261   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2262   MatScalar      *aa=a->a;
2263   PetscScalar    *x,*b;
2264 
2265   PetscFunctionBegin;
2266   SSE_SCOPE_BEGIN;
2267   /*
2268      Note: This code currently uses demotion of double
2269      to float when performing the mixed-mode computation.
2270      This may not be numerically reasonable for all applications.
2271   */
2272   PREFETCH_NTA(aa+16*ai[1]);
2273 
2274   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2275   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
2276   {
2277     /* x will first be computed in single precision then promoted inplace to double */
2278     MatScalar      *v,*t=(MatScalar *)x;
2279     int            nz,i,idt,ai16;
2280     unsigned int   jdx,idx;
2281     unsigned short *vi;
2282     /* Forward solve the lower triangular factor. */
2283 
2284     /* First block is the identity. */
2285     idx  = 0;
2286     CONVERT_DOUBLE4_FLOAT4(t,b);
2287     v    =  aa + 16*((unsigned int)ai[1]);
2288 
2289     for (i=1; i<n;) {
2290       PREFETCH_NTA(&v[8]);
2291       vi   =  aj      + ai[i];
2292       nz   =  diag[i] - ai[i];
2293       idx +=  4;
2294 
2295       /* Demote RHS from double to float. */
2296       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2297       LOAD_PS(&t[idx],XMM7);
2298 
2299       while (nz--) {
2300         PREFETCH_NTA(&v[16]);
2301         jdx = 4*((unsigned int)(*vi++));
2302 
2303         /* 4x4 Matrix-Vector product with negative accumulation: */
2304         SSE_INLINE_BEGIN_2(&t[jdx],v)
2305           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2306 
2307           /* First Column */
2308           SSE_COPY_PS(XMM0,XMM6)
2309           SSE_SHUFFLE(XMM0,XMM0,0x00)
2310           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2311           SSE_SUB_PS(XMM7,XMM0)
2312 
2313           /* Second Column */
2314           SSE_COPY_PS(XMM1,XMM6)
2315           SSE_SHUFFLE(XMM1,XMM1,0x55)
2316           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2317           SSE_SUB_PS(XMM7,XMM1)
2318 
2319           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2320 
2321           /* Third Column */
2322           SSE_COPY_PS(XMM2,XMM6)
2323           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2324           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2325           SSE_SUB_PS(XMM7,XMM2)
2326 
2327           /* Fourth Column */
2328           SSE_COPY_PS(XMM3,XMM6)
2329           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2330           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2331           SSE_SUB_PS(XMM7,XMM3)
2332         SSE_INLINE_END_2
2333 
2334         v  += 16;
2335       }
2336       v    =  aa + 16*ai[++i];
2337       PREFETCH_NTA(v);
2338       STORE_PS(&t[idx],XMM7);
2339     }
2340 
2341     /* Backward solve the upper triangular factor.*/
2342 
2343     idt  = 4*(n-1);
2344     ai16 = 16*diag[n-1];
2345     v    = aa + ai16 + 16;
2346     for (i=n-1; i>=0;){
2347       PREFETCH_NTA(&v[8]);
2348       vi = aj + diag[i] + 1;
2349       nz = ai[i+1] - diag[i] - 1;
2350 
2351       LOAD_PS(&t[idt],XMM7);
2352 
2353       while (nz--) {
2354         PREFETCH_NTA(&v[16]);
2355         idx = 4*((unsigned int)(*vi++));
2356 
2357         /* 4x4 Matrix-Vector Product with negative accumulation: */
2358         SSE_INLINE_BEGIN_2(&t[idx],v)
2359           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2360 
2361           /* First Column */
2362           SSE_COPY_PS(XMM0,XMM6)
2363           SSE_SHUFFLE(XMM0,XMM0,0x00)
2364           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2365           SSE_SUB_PS(XMM7,XMM0)
2366 
2367           /* Second Column */
2368           SSE_COPY_PS(XMM1,XMM6)
2369           SSE_SHUFFLE(XMM1,XMM1,0x55)
2370           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2371           SSE_SUB_PS(XMM7,XMM1)
2372 
2373           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2374 
2375           /* Third Column */
2376           SSE_COPY_PS(XMM2,XMM6)
2377           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2378           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2379           SSE_SUB_PS(XMM7,XMM2)
2380 
2381           /* Fourth Column */
2382           SSE_COPY_PS(XMM3,XMM6)
2383           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2384           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2385           SSE_SUB_PS(XMM7,XMM3)
2386         SSE_INLINE_END_2
2387         v  += 16;
2388       }
2389       v    = aa + ai16;
2390       ai16 = 16*diag[--i];
2391       PREFETCH_NTA(aa+ai16+16);
2392       /*
2393          Scale the result by the diagonal 4x4 block,
2394          which was inverted as part of the factorization
2395       */
2396       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2397         /* First Column */
2398         SSE_COPY_PS(XMM0,XMM7)
2399         SSE_SHUFFLE(XMM0,XMM0,0x00)
2400         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2401 
2402         /* Second Column */
2403         SSE_COPY_PS(XMM1,XMM7)
2404         SSE_SHUFFLE(XMM1,XMM1,0x55)
2405         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2406         SSE_ADD_PS(XMM0,XMM1)
2407 
2408         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2409 
2410         /* Third Column */
2411         SSE_COPY_PS(XMM2,XMM7)
2412         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2413         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2414         SSE_ADD_PS(XMM0,XMM2)
2415 
2416         /* Fourth Column */
2417         SSE_COPY_PS(XMM3,XMM7)
2418         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2419         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2420         SSE_ADD_PS(XMM0,XMM3)
2421 
2422         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2423       SSE_INLINE_END_3
2424 
2425       v    = aa + ai16 + 16;
2426       idt -= 4;
2427     }
2428 
2429     /* Convert t from single precision back to double precision (inplace)*/
2430     idt = 4*(n-1);
2431     for (i=n-1;i>=0;i--) {
2432       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2433       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2434       PetscScalar *xtemp=&x[idt];
2435       MatScalar   *ttemp=&t[idt];
2436       xtemp[3] = (PetscScalar)ttemp[3];
2437       xtemp[2] = (PetscScalar)ttemp[2];
2438       xtemp[1] = (PetscScalar)ttemp[1];
2439       xtemp[0] = (PetscScalar)ttemp[0];
2440       idt -= 4;
2441     }
2442 
2443   } /* End of artificial scope. */
2444   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2445   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2446   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2447   SSE_SCOPE_END;
2448   PetscFunctionReturn(0);
2449 }
2450 
2451 #undef __FUNCT__
2452 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2453 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2454 {
2455   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2456   int            *aj=a->j;
2457   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2458   MatScalar      *aa=a->a;
2459   PetscScalar    *x,*b;
2460 
2461   PetscFunctionBegin;
2462   SSE_SCOPE_BEGIN;
2463   /*
2464      Note: This code currently uses demotion of double
2465      to float when performing the mixed-mode computation.
2466      This may not be numerically reasonable for all applications.
2467   */
2468   PREFETCH_NTA(aa+16*ai[1]);
2469 
2470   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2471   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
2472   {
2473     /* x will first be computed in single precision then promoted inplace to double */
2474     MatScalar *v,*t=(MatScalar *)x;
2475     int       nz,i,idt,ai16;
2476     int       jdx,idx;
2477     int       *vi;
2478     /* Forward solve the lower triangular factor. */
2479 
2480     /* First block is the identity. */
2481     idx  = 0;
2482     CONVERT_DOUBLE4_FLOAT4(t,b);
2483     v    =  aa + 16*ai[1];
2484 
2485     for (i=1; i<n;) {
2486       PREFETCH_NTA(&v[8]);
2487       vi   =  aj      + ai[i];
2488       nz   =  diag[i] - ai[i];
2489       idx +=  4;
2490 
2491       /* Demote RHS from double to float. */
2492       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2493       LOAD_PS(&t[idx],XMM7);
2494 
2495       while (nz--) {
2496         PREFETCH_NTA(&v[16]);
2497         jdx = 4*(*vi++);
2498 /*          jdx = *vi++; */
2499 
2500         /* 4x4 Matrix-Vector product with negative accumulation: */
2501         SSE_INLINE_BEGIN_2(&t[jdx],v)
2502           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2503 
2504           /* First Column */
2505           SSE_COPY_PS(XMM0,XMM6)
2506           SSE_SHUFFLE(XMM0,XMM0,0x00)
2507           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2508           SSE_SUB_PS(XMM7,XMM0)
2509 
2510           /* Second Column */
2511           SSE_COPY_PS(XMM1,XMM6)
2512           SSE_SHUFFLE(XMM1,XMM1,0x55)
2513           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2514           SSE_SUB_PS(XMM7,XMM1)
2515 
2516           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2517 
2518           /* Third Column */
2519           SSE_COPY_PS(XMM2,XMM6)
2520           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2521           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2522           SSE_SUB_PS(XMM7,XMM2)
2523 
2524           /* Fourth Column */
2525           SSE_COPY_PS(XMM3,XMM6)
2526           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2527           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2528           SSE_SUB_PS(XMM7,XMM3)
2529         SSE_INLINE_END_2
2530 
2531         v  += 16;
2532       }
2533       v    =  aa + 16*ai[++i];
2534       PREFETCH_NTA(v);
2535       STORE_PS(&t[idx],XMM7);
2536     }
2537 
2538     /* Backward solve the upper triangular factor.*/
2539 
2540     idt  = 4*(n-1);
2541     ai16 = 16*diag[n-1];
2542     v    = aa + ai16 + 16;
2543     for (i=n-1; i>=0;){
2544       PREFETCH_NTA(&v[8]);
2545       vi = aj + diag[i] + 1;
2546       nz = ai[i+1] - diag[i] - 1;
2547 
2548       LOAD_PS(&t[idt],XMM7);
2549 
2550       while (nz--) {
2551         PREFETCH_NTA(&v[16]);
2552         idx = 4*(*vi++);
2553 /*          idx = *vi++; */
2554 
2555         /* 4x4 Matrix-Vector Product with negative accumulation: */
2556         SSE_INLINE_BEGIN_2(&t[idx],v)
2557           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2558 
2559           /* First Column */
2560           SSE_COPY_PS(XMM0,XMM6)
2561           SSE_SHUFFLE(XMM0,XMM0,0x00)
2562           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2563           SSE_SUB_PS(XMM7,XMM0)
2564 
2565           /* Second Column */
2566           SSE_COPY_PS(XMM1,XMM6)
2567           SSE_SHUFFLE(XMM1,XMM1,0x55)
2568           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2569           SSE_SUB_PS(XMM7,XMM1)
2570 
2571           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2572 
2573           /* Third Column */
2574           SSE_COPY_PS(XMM2,XMM6)
2575           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2576           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2577           SSE_SUB_PS(XMM7,XMM2)
2578 
2579           /* Fourth Column */
2580           SSE_COPY_PS(XMM3,XMM6)
2581           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2582           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2583           SSE_SUB_PS(XMM7,XMM3)
2584         SSE_INLINE_END_2
2585         v  += 16;
2586       }
2587       v    = aa + ai16;
2588       ai16 = 16*diag[--i];
2589       PREFETCH_NTA(aa+ai16+16);
2590       /*
2591          Scale the result by the diagonal 4x4 block,
2592          which was inverted as part of the factorization
2593       */
2594       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2595         /* First Column */
2596         SSE_COPY_PS(XMM0,XMM7)
2597         SSE_SHUFFLE(XMM0,XMM0,0x00)
2598         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2599 
2600         /* Second Column */
2601         SSE_COPY_PS(XMM1,XMM7)
2602         SSE_SHUFFLE(XMM1,XMM1,0x55)
2603         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2604         SSE_ADD_PS(XMM0,XMM1)
2605 
2606         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2607 
2608         /* Third Column */
2609         SSE_COPY_PS(XMM2,XMM7)
2610         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2611         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2612         SSE_ADD_PS(XMM0,XMM2)
2613 
2614         /* Fourth Column */
2615         SSE_COPY_PS(XMM3,XMM7)
2616         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2617         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2618         SSE_ADD_PS(XMM0,XMM3)
2619 
2620         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2621       SSE_INLINE_END_3
2622 
2623       v    = aa + ai16 + 16;
2624       idt -= 4;
2625     }
2626 
2627     /* Convert t from single precision back to double precision (inplace)*/
2628     idt = 4*(n-1);
2629     for (i=n-1;i>=0;i--) {
2630       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2631       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2632       PetscScalar *xtemp=&x[idt];
2633       MatScalar   *ttemp=&t[idt];
2634       xtemp[3] = (PetscScalar)ttemp[3];
2635       xtemp[2] = (PetscScalar)ttemp[2];
2636       xtemp[1] = (PetscScalar)ttemp[1];
2637       xtemp[0] = (PetscScalar)ttemp[0];
2638       idt -= 4;
2639     }
2640 
2641   } /* End of artificial scope. */
2642   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2643   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2644   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2645   SSE_SCOPE_END;
2646   PetscFunctionReturn(0);
2647 }
2648 
2649 #endif
2650 #undef __FUNCT__
2651 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2652 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2653 {
2654   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2655   IS              iscol=a->col,isrow=a->row;
2656   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2657   int             *diag = a->diag;
2658   MatScalar       *aa=a->a,*v;
2659   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
2660 
2661   PetscFunctionBegin;
2662   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2663   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2664   t  = a->solve_work;
2665 
2666   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2667   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2668 
2669   /* forward solve the lower triangular */
2670   idx    = 3*(*r++);
2671   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2672   for (i=1; i<n; i++) {
2673     v     = aa + 9*ai[i];
2674     vi    = aj + ai[i];
2675     nz    = diag[i] - ai[i];
2676     idx   = 3*(*r++);
2677     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2678     while (nz--) {
2679       idx   = 3*(*vi++);
2680       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2681       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2682       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2683       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2684       v += 9;
2685     }
2686     idx = 3*i;
2687     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2688   }
2689   /* backward solve the upper triangular */
2690   for (i=n-1; i>=0; i--){
2691     v    = aa + 9*diag[i] + 9;
2692     vi   = aj + diag[i] + 1;
2693     nz   = ai[i+1] - diag[i] - 1;
2694     idt  = 3*i;
2695     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2696     while (nz--) {
2697       idx   = 3*(*vi++);
2698       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2699       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2700       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2701       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2702       v += 9;
2703     }
2704     idc = 3*(*c--);
2705     v   = aa + 9*diag[i];
2706     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2707     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2708     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2709   }
2710   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2711   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2712   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2713   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2714   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2715   PetscFunctionReturn(0);
2716 }
2717 
2718 /*
2719       Special case where the matrix was ILU(0) factored in the natural
2720    ordering. This eliminates the need for the column and row permutation.
2721 */
2722 #undef __FUNCT__
2723 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2724 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2725 {
2726   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2727   int             n=a->mbs,*ai=a->i,*aj=a->j;
2728   int             ierr,*diag = a->diag;
2729   MatScalar       *aa=a->a,*v;
2730   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2731   int             jdx,idt,idx,nz,*vi,i;
2732 
2733   PetscFunctionBegin;
2734   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2735   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2736 
2737 
2738   /* forward solve the lower triangular */
2739   idx    = 0;
2740   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2741   for (i=1; i<n; i++) {
2742     v     =  aa      + 9*ai[i];
2743     vi    =  aj      + ai[i];
2744     nz    =  diag[i] - ai[i];
2745     idx   +=  3;
2746     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2747     while (nz--) {
2748       jdx   = 3*(*vi++);
2749       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2750       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2751       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2752       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2753       v    += 9;
2754     }
2755     x[idx]   = s1;
2756     x[1+idx] = s2;
2757     x[2+idx] = s3;
2758   }
2759   /* backward solve the upper triangular */
2760   for (i=n-1; i>=0; i--){
2761     v    = aa + 9*diag[i] + 9;
2762     vi   = aj + diag[i] + 1;
2763     nz   = ai[i+1] - diag[i] - 1;
2764     idt  = 3*i;
2765     s1 = x[idt];  s2 = x[1+idt];
2766     s3 = x[2+idt];
2767     while (nz--) {
2768       idx   = 3*(*vi++);
2769       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2770       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2771       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2772       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2773       v    += 9;
2774     }
2775     v        = aa +  9*diag[i];
2776     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2777     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2778     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2779   }
2780 
2781   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2782   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2783   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2784   PetscFunctionReturn(0);
2785 }
2786 
2787 #undef __FUNCT__
2788 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2789 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2790 {
2791   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2792   IS              iscol=a->col,isrow=a->row;
2793   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2794   int             *diag = a->diag;
2795   MatScalar       *aa=a->a,*v;
2796   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
2797 
2798   PetscFunctionBegin;
2799   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2801   t  = a->solve_work;
2802 
2803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2805 
2806   /* forward solve the lower triangular */
2807   idx    = 2*(*r++);
2808   t[0] = b[idx]; t[1] = b[1+idx];
2809   for (i=1; i<n; i++) {
2810     v     = aa + 4*ai[i];
2811     vi    = aj + ai[i];
2812     nz    = diag[i] - ai[i];
2813     idx   = 2*(*r++);
2814     s1  = b[idx]; s2 = b[1+idx];
2815     while (nz--) {
2816       idx   = 2*(*vi++);
2817       x1    = t[idx]; x2 = t[1+idx];
2818       s1 -= v[0]*x1 + v[2]*x2;
2819       s2 -= v[1]*x1 + v[3]*x2;
2820       v += 4;
2821     }
2822     idx = 2*i;
2823     t[idx] = s1; t[1+idx] = s2;
2824   }
2825   /* backward solve the upper triangular */
2826   for (i=n-1; i>=0; i--){
2827     v    = aa + 4*diag[i] + 4;
2828     vi   = aj + diag[i] + 1;
2829     nz   = ai[i+1] - diag[i] - 1;
2830     idt  = 2*i;
2831     s1 = t[idt]; s2 = t[1+idt];
2832     while (nz--) {
2833       idx   = 2*(*vi++);
2834       x1    = t[idx]; x2 = t[1+idx];
2835       s1 -= v[0]*x1 + v[2]*x2;
2836       s2 -= v[1]*x1 + v[3]*x2;
2837       v += 4;
2838     }
2839     idc = 2*(*c--);
2840     v   = aa + 4*diag[i];
2841     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2842     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2843   }
2844   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2845   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2846   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2847   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2848   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2849   PetscFunctionReturn(0);
2850 }
2851 
2852 /*
2853       Special case where the matrix was ILU(0) factored in the natural
2854    ordering. This eliminates the need for the column and row permutation.
2855 */
2856 #undef __FUNCT__
2857 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2858 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2859 {
2860   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2861   int             n=a->mbs,*ai=a->i,*aj=a->j;
2862   int             ierr,*diag = a->diag;
2863   MatScalar       *aa=a->a,*v;
2864   PetscScalar     *x,*b,s1,s2,x1,x2;
2865   int             jdx,idt,idx,nz,*vi,i;
2866 
2867   PetscFunctionBegin;
2868   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2869   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2870 
2871   /* forward solve the lower triangular */
2872   idx    = 0;
2873   x[0]   = b[0]; x[1] = b[1];
2874   for (i=1; i<n; i++) {
2875     v     =  aa      + 4*ai[i];
2876     vi    =  aj      + ai[i];
2877     nz    =  diag[i] - ai[i];
2878     idx   +=  2;
2879     s1  =  b[idx];s2 = b[1+idx];
2880     while (nz--) {
2881       jdx   = 2*(*vi++);
2882       x1    = x[jdx];x2 = x[1+jdx];
2883       s1 -= v[0]*x1 + v[2]*x2;
2884       s2 -= v[1]*x1 + v[3]*x2;
2885       v    += 4;
2886     }
2887     x[idx]   = s1;
2888     x[1+idx] = s2;
2889   }
2890   /* backward solve the upper triangular */
2891   for (i=n-1; i>=0; i--){
2892     v    = aa + 4*diag[i] + 4;
2893     vi   = aj + diag[i] + 1;
2894     nz   = ai[i+1] - diag[i] - 1;
2895     idt  = 2*i;
2896     s1 = x[idt];  s2 = x[1+idt];
2897     while (nz--) {
2898       idx   = 2*(*vi++);
2899       x1    = x[idx];   x2 = x[1+idx];
2900       s1 -= v[0]*x1 + v[2]*x2;
2901       s2 -= v[1]*x1 + v[3]*x2;
2902       v    += 4;
2903     }
2904     v        = aa +  4*diag[i];
2905     x[idt]   = v[0]*s1 + v[2]*s2;
2906     x[1+idt] = v[1]*s1 + v[3]*s2;
2907   }
2908 
2909   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2910   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2911   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2912   PetscFunctionReturn(0);
2913 }
2914 
2915 #undef __FUNCT__
2916 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2917 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2918 {
2919   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2920   IS              iscol=a->col,isrow=a->row;
2921   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2922   int             *diag = a->diag;
2923   MatScalar       *aa=a->a,*v;
2924   PetscScalar     *x,*b,s1,*t;
2925 
2926   PetscFunctionBegin;
2927   if (!n) PetscFunctionReturn(0);
2928 
2929   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2930   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2931   t  = a->solve_work;
2932 
2933   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2934   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2935 
2936   /* forward solve the lower triangular */
2937   t[0] = b[*r++];
2938   for (i=1; i<n; i++) {
2939     v     = aa + ai[i];
2940     vi    = aj + ai[i];
2941     nz    = diag[i] - ai[i];
2942     s1  = b[*r++];
2943     while (nz--) {
2944       s1 -= (*v++)*t[*vi++];
2945     }
2946     t[i] = s1;
2947   }
2948   /* backward solve the upper triangular */
2949   for (i=n-1; i>=0; i--){
2950     v    = aa + diag[i] + 1;
2951     vi   = aj + diag[i] + 1;
2952     nz   = ai[i+1] - diag[i] - 1;
2953     s1 = t[i];
2954     while (nz--) {
2955       s1 -= (*v++)*t[*vi++];
2956     }
2957     x[*c--] = t[i] = aa[diag[i]]*s1;
2958   }
2959 
2960   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2961   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2962   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2963   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2964   PetscLogFlops(2*1*(a->nz) - A->n);
2965   PetscFunctionReturn(0);
2966 }
2967 /*
2968       Special case where the matrix was ILU(0) factored in the natural
2969    ordering. This eliminates the need for the column and row permutation.
2970 */
2971 #undef __FUNCT__
2972 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2973 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2974 {
2975   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2976   int             n=a->mbs,*ai=a->i,*aj=a->j;
2977   int             ierr,*diag = a->diag;
2978   MatScalar       *aa=a->a;
2979   PetscScalar     *x,*b;
2980   PetscScalar     s1,x1;
2981   MatScalar       *v;
2982   int             jdx,idt,idx,nz,*vi,i;
2983 
2984   PetscFunctionBegin;
2985   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2986   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2987 
2988   /* forward solve the lower triangular */
2989   idx    = 0;
2990   x[0]   = b[0];
2991   for (i=1; i<n; i++) {
2992     v     =  aa      + ai[i];
2993     vi    =  aj      + ai[i];
2994     nz    =  diag[i] - ai[i];
2995     idx   +=  1;
2996     s1  =  b[idx];
2997     while (nz--) {
2998       jdx   = *vi++;
2999       x1    = x[jdx];
3000       s1 -= v[0]*x1;
3001       v    += 1;
3002     }
3003     x[idx]   = s1;
3004   }
3005   /* backward solve the upper triangular */
3006   for (i=n-1; i>=0; i--){
3007     v    = aa + diag[i] + 1;
3008     vi   = aj + diag[i] + 1;
3009     nz   = ai[i+1] - diag[i] - 1;
3010     idt  = i;
3011     s1 = x[idt];
3012     while (nz--) {
3013       idx   = *vi++;
3014       x1    = x[idx];
3015       s1 -= v[0]*x1;
3016       v    += 1;
3017     }
3018     v        = aa +  diag[i];
3019     x[idt]   = v[0]*s1;
3020   }
3021   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3022   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3023   PetscLogFlops(2*(a->nz) - A->n);
3024   PetscFunctionReturn(0);
3025 }
3026 
3027 /* ----------------------------------------------------------------*/
3028 /*
3029      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3030    except that the data structure of Mat_SeqAIJ is slightly different.
3031    Not a good example of code reuse.
3032 */
3033 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
3034 
3035 #undef __FUNCT__
3036 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3037 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3038 {
3039   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3040   IS          isicol;
3041   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3042   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3043   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3044   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3045   PetscTruth  col_identity,row_identity;
3046   PetscReal   f;
3047 
3048   PetscFunctionBegin;
3049   f             = info->fill;
3050   levels        = (int)info->levels;
3051   diagonal_fill = (int)info->diagonal_fill;
3052   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3053   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3054   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3055 
3056   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3057     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3058     (*fact)->factor = FACTOR_LU;
3059     b               = (Mat_SeqBAIJ*)(*fact)->data;
3060     if (!b->diag) {
3061       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3062     }
3063     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3064     b->row        = isrow;
3065     b->col        = iscol;
3066     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3067     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3068     b->icol       = isicol;
3069     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3070     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3071   } else { /* general case perform the symbolic factorization */
3072     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3073     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3074 
3075     /* get new row pointers */
3076     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
3077     ainew[0] = 0;
3078     /* don't know how many column pointers are needed so estimate */
3079     jmax = (int)(f*ai[n] + 1);
3080     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
3081     /* ajfill is level of fill for each fill entry */
3082     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
3083     /* fill is a linked list of nonzeros in active row */
3084     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
3085     /* im is level for each filled value */
3086     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
3087     /* dloc is location of diagonal in factor */
3088     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
3089     dloc[0]  = 0;
3090     for (prow=0; prow<n; prow++) {
3091 
3092       /* copy prow into linked list */
3093       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3094       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3095       xi         = aj + ai[r[prow]];
3096       fill[n]    = n;
3097       fill[prow] = -1; /* marker for diagonal entry */
3098       while (nz--) {
3099 	fm  = n;
3100 	idx = ic[*xi++];
3101 	do {
3102 	  m  = fm;
3103 	  fm = fill[m];
3104 	} while (fm < idx);
3105 	fill[m]   = idx;
3106 	fill[idx] = fm;
3107 	im[idx]   = 0;
3108       }
3109 
3110       /* make sure diagonal entry is included */
3111       if (diagonal_fill && fill[prow] == -1) {
3112 	fm = n;
3113 	while (fill[fm] < prow) fm = fill[fm];
3114 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3115 	fill[fm]   = prow;
3116 	im[prow]   = 0;
3117 	nzf++;
3118 	dcount++;
3119       }
3120 
3121       nzi = 0;
3122       row = fill[n];
3123       while (row < prow) {
3124 	incrlev = im[row] + 1;
3125 	nz      = dloc[row];
3126 	xi      = ajnew  + ainew[row] + nz + 1;
3127 	flev    = ajfill + ainew[row] + nz + 1;
3128 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3129 	fm      = row;
3130 	while (nnz-- > 0) {
3131 	  idx = *xi++;
3132 	  if (*flev + incrlev > levels) {
3133 	    flev++;
3134 	    continue;
3135 	  }
3136 	  do {
3137 	    m  = fm;
3138 	    fm = fill[m];
3139 	  } while (fm < idx);
3140 	  if (fm != idx) {
3141 	    im[idx]   = *flev + incrlev;
3142 	    fill[m]   = idx;
3143 	    fill[idx] = fm;
3144 	    fm        = idx;
3145 	    nzf++;
3146 	  } else {
3147 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3148 	  }
3149 	  flev++;
3150 	}
3151 	row = fill[row];
3152 	nzi++;
3153       }
3154       /* copy new filled row into permanent storage */
3155       ainew[prow+1] = ainew[prow] + nzf;
3156       if (ainew[prow+1] > jmax) {
3157 
3158 	/* estimate how much additional space we will need */
3159 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3160 	/* just double the memory each time */
3161 	int maxadd = jmax;
3162 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3163 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3164 	jmax += maxadd;
3165 
3166 	/* allocate a longer ajnew and ajfill */
3167 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3168 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3169 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3170 	ajnew = xi;
3171 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3172 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3173 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3174 	ajfill = xi;
3175 	reallocate++; /* count how many reallocations are needed */
3176       }
3177       xi          = ajnew + ainew[prow];
3178       flev        = ajfill + ainew[prow];
3179       dloc[prow]  = nzi;
3180       fm          = fill[n];
3181       while (nzf--) {
3182 	*xi++   = fm;
3183 	*flev++ = im[fm];
3184 	fm      = fill[fm];
3185       }
3186       /* make sure row has diagonal entry */
3187       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3188 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3189     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3190       }
3191     }
3192     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3193     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3194     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3195     ierr = PetscFree(fill);CHKERRQ(ierr);
3196     ierr = PetscFree(im);CHKERRQ(ierr);
3197 
3198     {
3199       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3200       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3201       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3202       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3203       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3204       if (diagonal_fill) {
3205 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3206       }
3207     }
3208 
3209     /* put together the new matrix */
3210     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
3211     PetscLogObjectParent(*fact,isicol);
3212     b = (Mat_SeqBAIJ*)(*fact)->data;
3213     ierr = PetscFree(b->imax);CHKERRQ(ierr);
3214     b->singlemalloc = PETSC_FALSE;
3215     len = bs2*ainew[n]*sizeof(MatScalar);
3216     /* the next line frees the default space generated by the Create() */
3217     ierr = PetscFree(b->a);CHKERRQ(ierr);
3218     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
3219     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
3220     b->j          = ajnew;
3221     b->i          = ainew;
3222     for (i=0; i<n; i++) dloc[i] += ainew[i];
3223     b->diag       = dloc;
3224     b->ilen       = 0;
3225     b->imax       = 0;
3226     b->row        = isrow;
3227     b->col        = iscol;
3228     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3229     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3230     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3231     b->icol       = isicol;
3232     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3233     /* In b structure:  Free imax, ilen, old a, old j.
3234        Allocate dloc, solve_work, new a, new j */
3235     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3236     b->maxnz          = b->nz = ainew[n];
3237     (*fact)->factor   = FACTOR_LU;
3238 
3239     (*fact)->info.factor_mallocs    = reallocate;
3240     (*fact)->info.fill_ratio_given  = f;
3241     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3242   }
3243 
3244   if (row_identity && col_identity) {
3245     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
3246   }
3247   PetscFunctionReturn(0);
3248 }
3249 
3250 #undef __FUNCT__
3251 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3252 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3253 {
3254   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3255   /* int i,*AJ=a->j,nz=a->nz; */
3256   PetscFunctionBegin;
3257   /* Undo Column scaling */
3258 /*    while (nz--) { */
3259 /*      AJ[i] = AJ[i]/4; */
3260 /*    } */
3261   /* This should really invoke a push/pop logic, but we don't have that yet. */
3262   A->ops->setunfactored = PETSC_NULL;
3263   PetscFunctionReturn(0);
3264 }
3265 
3266 #undef __FUNCT__
3267 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3268 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3269 {
3270   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3271   int *AJ=a->j,nz=a->nz;
3272   unsigned short *aj=(unsigned short *)AJ;
3273   PetscFunctionBegin;
3274   /* Is this really necessary? */
3275   while (nz--) {
3276     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3277   }
3278   A->ops->setunfactored = PETSC_NULL;
3279   PetscFunctionReturn(0);
3280 }
3281 
3282 #undef __FUNCT__
3283 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3284 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3285 {
3286   /*
3287       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3288       with natural ordering
3289   */
3290   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3291 
3292   PetscFunctionBegin;
3293   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3294   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3295   switch (a->bs) {
3296   case 1:
3297     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3298     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3299     break;
3300   case 2:
3301     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3302     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3303     break;
3304   case 3:
3305     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3306     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3307     break;
3308   case 4:
3309 #if defined(PETSC_USE_MAT_SINGLE)
3310     {
3311       PetscTruth  sse_enabled_local;
3312       int         ierr;
3313       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3314       if (sse_enabled_local) {
3315 #  if defined(PETSC_HAVE_SSE)
3316         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3317         if (n==(unsigned short)n) {
3318           unsigned short *aj=(unsigned short *)AJ;
3319           for (i=0;i<nz;i++) {
3320             aj[i] = (unsigned short)AJ[i];
3321           }
3322           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3323           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3324           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3325         } else {
3326         /* Scale the column indices for easier indexing in MatSolve. */
3327 /*            for (i=0;i<nz;i++) { */
3328 /*              AJ[i] = AJ[i]*4; */
3329 /*            } */
3330           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3331           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3332           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3333         }
3334 #  else
3335       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3336         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3337 #  endif
3338       } else {
3339         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3340         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3341       }
3342     }
3343 #else
3344     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3345     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3346 #endif
3347     break;
3348   case 5:
3349     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3350     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3351     break;
3352   case 6:
3353     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3354     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3355     break;
3356   case 7:
3357     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3358     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3359     break;
3360   }
3361   PetscFunctionReturn(0);
3362 }
3363 
3364 #undef __FUNCT__
3365 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3366 int MatSeqBAIJ_UpdateSolvers(Mat A)
3367 {
3368   /*
3369       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3370       with natural ordering
3371   */
3372   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3373   IS          row = a->row, col = a->col;
3374   PetscTruth  row_identity, col_identity;
3375   PetscTruth  use_natural;
3376   int         ierr;
3377 
3378   PetscFunctionBegin;
3379 
3380   use_natural = PETSC_FALSE;
3381 
3382   ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3383   ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3384 
3385   if (row_identity && col_identity) {
3386     use_natural = PETSC_TRUE;
3387   } else {
3388     use_natural = PETSC_FALSE;
3389   }
3390   switch (a->bs) {
3391   case 1:
3392     if (use_natural) {
3393       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3394       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3395       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3396       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3397     } else {
3398       A->ops->solve           = MatSolve_SeqBAIJ_1;
3399       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3400     }
3401     break;
3402   case 2:
3403     if (use_natural) {
3404       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3405       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3406       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3407       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3408     } else {
3409       A->ops->solve           = MatSolve_SeqBAIJ_2;
3410       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3411     }
3412     break;
3413   case 3:
3414     if (use_natural) {
3415       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3416       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3417       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3418       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3419     } else {
3420       A->ops->solve           = MatSolve_SeqBAIJ_3;
3421       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3422     }
3423     break;
3424   case 4:
3425     {
3426       PetscTruth sse_enabled_local;
3427       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3428       if (use_natural) {
3429 #if defined(PETSC_USE_MAT_SINGLE)
3430         if (sse_enabled_local) { /* Natural + Single + SSE */
3431 #  if defined(PETSC_HAVE_SSE)
3432           int n=a->mbs;
3433           if (n==(unsigned short)n) {
3434             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3435             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3436           } else {
3437             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3438             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3439           }
3440 #  else
3441           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3442           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3443 #  endif
3444         } else { /* Natural + Single */
3445           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3446           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3447         }
3448 #else
3449         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3450         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3451 #endif
3452         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3453         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3454       } else { /* Arbitrary ordering */
3455 #if defined(PETSC_USE_MAT_SINGLE)
3456         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3457 #  if defined(PETSC_HAVE_SSE)
3458           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3459           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3460 #  else
3461           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3462           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3463 #  endif
3464         } else { /* Arbitrary + Single */
3465           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3466           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3467         }
3468 #else
3469         A->ops->solve           = MatSolve_SeqBAIJ_4;
3470 #endif
3471         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3472       }
3473     }
3474     break;
3475   case 5:
3476     if (use_natural) {
3477       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3478       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3479       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3480       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3481     } else {
3482       A->ops->solve           = MatSolve_SeqBAIJ_5;
3483       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3484     }
3485     break;
3486   case 6:
3487     if (use_natural) {
3488       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3489       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3490       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3491       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3492     } else {
3493       A->ops->solve           = MatSolve_SeqBAIJ_6;
3494       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3495     }
3496     break;
3497   case 7:
3498     if (use_natural) {
3499       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3500       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3501       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3502       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3503     } else {
3504       A->ops->solve           = MatSolve_SeqBAIJ_7;
3505       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3506     }
3507     break;
3508   default:
3509     A->ops->solve             = MatSolve_SeqBAIJ_N;
3510     break;
3511   }
3512   PetscFunctionReturn(0);
3513 }
3514 
3515 #undef __FUNCT__
3516 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3517 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3518   int ierr;
3519 
3520   PetscFunctionBegin;
3521   ierr = MatSeqBAIJ_UpdateSolvers(A);
3522   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3523     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3524   } else {
3525     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3526   }
3527   PetscFunctionReturn(0);
3528 }
3529 
3530 #undef __FUNCT__
3531 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3532 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3533   int ierr;
3534 
3535   PetscFunctionBegin;
3536   ierr = MatSeqBAIJ_UpdateSolvers(A);
3537   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3538   PetscFunctionReturn(0);
3539 }
3540 
3541 
3542