xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 9c30b7d2697335155d7490a7e085415ee7b4a02a)
1 /*
2     Factorization code for BAIJ format.
3 */
4 
5 #include "src/mat/impls/baij/seq/baij.h"
6 #include "src/inline/ilu.h"
7 #include "src/inline/dot.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
11 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
12 {
13   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
14   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
15   int             *diag = a->diag;
16   MatScalar       *aa=a->a,*v;
17   PetscScalar     s1,*x,*b;
18 
19   PetscFunctionBegin;
20   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
21   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
22   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23 
24   /* forward solve the U^T */
25   for (i=0; i<n; i++) {
26 
27     v     = aa + diag[i];
28     /* multiply by the inverse of the block diagonal */
29     s1    = (*v++)*x[i];
30     vi    = aj + diag[i] + 1;
31     nz    = ai[i+1] - diag[i] - 1;
32     while (nz--) {
33       x[*vi++]  -= (*v++)*s1;
34     }
35     x[i]   = s1;
36   }
37   /* backward solve the L^T */
38   for (i=n-1; i>=0; i--){
39     v    = aa + diag[i] - 1;
40     vi   = aj + diag[i] - 1;
41     nz   = diag[i] - ai[i];
42     s1   = x[i];
43     while (nz--) {
44       x[*vi--]   -=  (*v--)*s1;
45     }
46   }
47   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
48   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
49   PetscLogFlops(2*(a->nz) - A->n);
50   PetscFunctionReturn(0);
51 }
52 
53 #undef __FUNCT__
54 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
55 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
56 {
57   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
58   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
59   int             *diag = a->diag,oidx;
60   MatScalar       *aa=a->a,*v;
61   PetscScalar     s1,s2,x1,x2;
62   PetscScalar     *x,*b;
63 
64   PetscFunctionBegin;
65   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
66   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
67   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
68 
69   /* forward solve the U^T */
70   idx = 0;
71   for (i=0; i<n; i++) {
72 
73     v     = aa + 4*diag[i];
74     /* multiply by the inverse of the block diagonal */
75     x1 = x[idx];   x2 = x[1+idx];
76     s1 = v[0]*x1  +  v[1]*x2;
77     s2 = v[2]*x1  +  v[3]*x2;
78     v += 4;
79 
80     vi    = aj + diag[i] + 1;
81     nz    = ai[i+1] - diag[i] - 1;
82     while (nz--) {
83       oidx = 2*(*vi++);
84       x[oidx]   -= v[0]*s1  +  v[1]*s2;
85       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
86       v  += 4;
87     }
88     x[idx]   = s1;x[1+idx] = s2;
89     idx += 2;
90   }
91   /* backward solve the L^T */
92   for (i=n-1; i>=0; i--){
93     v    = aa + 4*diag[i] - 4;
94     vi   = aj + diag[i] - 1;
95     nz   = diag[i] - ai[i];
96     idt  = 2*i;
97     s1   = x[idt];  s2 = x[1+idt];
98     while (nz--) {
99       idx   = 2*(*vi--);
100       x[idx]   -=  v[0]*s1 +  v[1]*s2;
101       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
102       v -= 4;
103     }
104   }
105   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
106   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
107   PetscLogFlops(2*4*(a->nz) - 2*A->n);
108   PetscFunctionReturn(0);
109 }
110 
111 #undef __FUNCT__
112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
113 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
114 {
115   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
116   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
117   int             *diag = a->diag,oidx;
118   MatScalar       *aa=a->a,*v;
119   PetscScalar     s1,s2,s3,x1,x2,x3;
120   PetscScalar     *x,*b;
121 
122   PetscFunctionBegin;
123   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
124   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
125   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126 
127   /* forward solve the U^T */
128   idx = 0;
129   for (i=0; i<n; i++) {
130 
131     v     = aa + 9*diag[i];
132     /* multiply by the inverse of the block diagonal */
133     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
134     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
135     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
136     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
137     v += 9;
138 
139     vi    = aj + diag[i] + 1;
140     nz    = ai[i+1] - diag[i] - 1;
141     while (nz--) {
142       oidx = 3*(*vi++);
143       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
144       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
145       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
146       v  += 9;
147     }
148     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
149     idx += 3;
150   }
151   /* backward solve the L^T */
152   for (i=n-1; i>=0; i--){
153     v    = aa + 9*diag[i] - 9;
154     vi   = aj + diag[i] - 1;
155     nz   = diag[i] - ai[i];
156     idt  = 3*i;
157     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
158     while (nz--) {
159       idx   = 3*(*vi--);
160       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
161       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
162       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
163       v -= 9;
164     }
165   }
166   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
168   PetscLogFlops(2*9*(a->nz) - 3*A->n);
169   PetscFunctionReturn(0);
170 }
171 
172 #undef __FUNCT__
173 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
174 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
175 {
176   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
177   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
178   int             *diag = a->diag,oidx;
179   MatScalar       *aa=a->a,*v;
180   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
181   PetscScalar     *x,*b;
182 
183   PetscFunctionBegin;
184   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
185   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
186   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
187 
188   /* forward solve the U^T */
189   idx = 0;
190   for (i=0; i<n; i++) {
191 
192     v     = aa + 16*diag[i];
193     /* multiply by the inverse of the block diagonal */
194     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
195     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
196     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
197     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
198     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
199     v += 16;
200 
201     vi    = aj + diag[i] + 1;
202     nz    = ai[i+1] - diag[i] - 1;
203     while (nz--) {
204       oidx = 4*(*vi++);
205       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
206       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
207       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
208       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
209       v  += 16;
210     }
211     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
212     idx += 4;
213   }
214   /* backward solve the L^T */
215   for (i=n-1; i>=0; i--){
216     v    = aa + 16*diag[i] - 16;
217     vi   = aj + diag[i] - 1;
218     nz   = diag[i] - ai[i];
219     idt  = 4*i;
220     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
221     while (nz--) {
222       idx   = 4*(*vi--);
223       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
224       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
225       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
226       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
227       v -= 16;
228     }
229   }
230   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
231   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
232   PetscLogFlops(2*16*(a->nz) - 4*A->n);
233   PetscFunctionReturn(0);
234 }
235 
236 #undef __FUNCT__
237 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
238 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
239 {
240   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
241   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
242   int             *diag = a->diag,oidx;
243   MatScalar       *aa=a->a,*v;
244   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
245   PetscScalar     *x,*b;
246 
247   PetscFunctionBegin;
248   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
249   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
250   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
251 
252   /* forward solve the U^T */
253   idx = 0;
254   for (i=0; i<n; i++) {
255 
256     v     = aa + 25*diag[i];
257     /* multiply by the inverse of the block diagonal */
258     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
259     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
260     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
261     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
262     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
263     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
264     v += 25;
265 
266     vi    = aj + diag[i] + 1;
267     nz    = ai[i+1] - diag[i] - 1;
268     while (nz--) {
269       oidx = 5*(*vi++);
270       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
271       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
272       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
273       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
274       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
275       v  += 25;
276     }
277     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
278     idx += 5;
279   }
280   /* backward solve the L^T */
281   for (i=n-1; i>=0; i--){
282     v    = aa + 25*diag[i] - 25;
283     vi   = aj + diag[i] - 1;
284     nz   = diag[i] - ai[i];
285     idt  = 5*i;
286     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
287     while (nz--) {
288       idx   = 5*(*vi--);
289       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
290       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
291       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
292       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
293       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
294       v -= 25;
295     }
296   }
297   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
298   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
299   PetscLogFlops(2*25*(a->nz) - 5*A->n);
300   PetscFunctionReturn(0);
301 }
302 
303 #undef __FUNCT__
304 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
305 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
306 {
307   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
308   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
309   int             *diag = a->diag,oidx;
310   MatScalar       *aa=a->a,*v;
311   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
312   PetscScalar     *x,*b;
313 
314   PetscFunctionBegin;
315   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
316   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
317   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
318 
319   /* forward solve the U^T */
320   idx = 0;
321   for (i=0; i<n; i++) {
322 
323     v     = aa + 36*diag[i];
324     /* multiply by the inverse of the block diagonal */
325     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
326     x6    = x[5+idx];
327     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
328     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
329     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
330     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
331     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
332     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
333     v += 36;
334 
335     vi    = aj + diag[i] + 1;
336     nz    = ai[i+1] - diag[i] - 1;
337     while (nz--) {
338       oidx = 6*(*vi++);
339       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
340       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
341       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
342       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
343       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
344       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
345       v  += 36;
346     }
347     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
348     x[5+idx] = s6;
349     idx += 6;
350   }
351   /* backward solve the L^T */
352   for (i=n-1; i>=0; i--){
353     v    = aa + 36*diag[i] - 36;
354     vi   = aj + diag[i] - 1;
355     nz   = diag[i] - ai[i];
356     idt  = 6*i;
357     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
358     s6 = x[5+idt];
359     while (nz--) {
360       idx   = 6*(*vi--);
361       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
362       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
363       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
364       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
365       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
366       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
367       v -= 36;
368     }
369   }
370   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
371   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
372   PetscLogFlops(2*36*(a->nz) - 6*A->n);
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
378 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
379 {
380   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
381   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
382   int             *diag = a->diag,oidx;
383   MatScalar       *aa=a->a,*v;
384   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
385   PetscScalar     *x,*b;
386 
387   PetscFunctionBegin;
388   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
389   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
390   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
391 
392   /* forward solve the U^T */
393   idx = 0;
394   for (i=0; i<n; i++) {
395 
396     v     = aa + 49*diag[i];
397     /* multiply by the inverse of the block diagonal */
398     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
399     x6    = x[5+idx]; x7 = x[6+idx];
400     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
401     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
402     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
403     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
404     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
405     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
406     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
407     v += 49;
408 
409     vi    = aj + diag[i] + 1;
410     nz    = ai[i+1] - diag[i] - 1;
411     while (nz--) {
412       oidx = 7*(*vi++);
413       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
414       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
415       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
416       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
417       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
418       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
419       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
420       v  += 49;
421     }
422     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
423     x[5+idx] = s6;x[6+idx] = s7;
424     idx += 7;
425   }
426   /* backward solve the L^T */
427   for (i=n-1; i>=0; i--){
428     v    = aa + 49*diag[i] - 49;
429     vi   = aj + diag[i] - 1;
430     nz   = diag[i] - ai[i];
431     idt  = 7*i;
432     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
433     s6 = x[5+idt];s7 = x[6+idt];
434     while (nz--) {
435       idx   = 7*(*vi--);
436       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
437       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
438       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
439       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
440       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
441       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
442       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
443       v -= 49;
444     }
445   }
446   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
447   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
448   PetscLogFlops(2*49*(a->nz) - 7*A->n);
449   PetscFunctionReturn(0);
450 }
451 
452 /*---------------------------------------------------------------------------------------------*/
453 #undef __FUNCT__
454 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
455 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
456 {
457   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
458   IS              iscol=a->col,isrow=a->row;
459   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
460   int             *diag = a->diag;
461   MatScalar       *aa=a->a,*v;
462   PetscScalar     s1,*x,*b,*t;
463 
464   PetscFunctionBegin;
465   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
466   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
467   t  = a->solve_work;
468 
469   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
470   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
471 
472   /* copy the b into temp work space according to permutation */
473   for (i=0; i<n; i++) {
474     t[i] = b[c[i]];
475   }
476 
477   /* forward solve the U^T */
478   for (i=0; i<n; i++) {
479 
480     v     = aa + diag[i];
481     /* multiply by the inverse of the block diagonal */
482     s1    = (*v++)*t[i];
483     vi    = aj + diag[i] + 1;
484     nz    = ai[i+1] - diag[i] - 1;
485     while (nz--) {
486       t[*vi++]  -= (*v++)*s1;
487     }
488     t[i]   = s1;
489   }
490   /* backward solve the L^T */
491   for (i=n-1; i>=0; i--){
492     v    = aa + diag[i] - 1;
493     vi   = aj + diag[i] - 1;
494     nz   = diag[i] - ai[i];
495     s1   = t[i];
496     while (nz--) {
497       t[*vi--]   -=  (*v--)*s1;
498     }
499   }
500 
501   /* copy t into x according to permutation */
502   for (i=0; i<n; i++) {
503     x[r[i]]   = t[i];
504   }
505 
506   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
507   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
508   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
509   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
510   PetscLogFlops(2*(a->nz) - A->n);
511   PetscFunctionReturn(0);
512 }
513 
514 #undef __FUNCT__
515 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
516 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
517 {
518   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
519   IS              iscol=a->col,isrow=a->row;
520   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
521   int             *diag = a->diag,ii,ic,ir,oidx;
522   MatScalar       *aa=a->a,*v;
523   PetscScalar     s1,s2,x1,x2;
524   PetscScalar     *x,*b,*t;
525 
526   PetscFunctionBegin;
527   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
528   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
529   t  = a->solve_work;
530 
531   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
532   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
533 
534   /* copy the b into temp work space according to permutation */
535   ii = 0;
536   for (i=0; i<n; i++) {
537     ic      = 2*c[i];
538     t[ii]   = b[ic];
539     t[ii+1] = b[ic+1];
540     ii += 2;
541   }
542 
543   /* forward solve the U^T */
544   idx = 0;
545   for (i=0; i<n; i++) {
546 
547     v     = aa + 4*diag[i];
548     /* multiply by the inverse of the block diagonal */
549     x1    = t[idx];   x2 = t[1+idx];
550     s1 = v[0]*x1  +  v[1]*x2;
551     s2 = v[2]*x1  +  v[3]*x2;
552     v += 4;
553 
554     vi    = aj + diag[i] + 1;
555     nz    = ai[i+1] - diag[i] - 1;
556     while (nz--) {
557       oidx = 2*(*vi++);
558       t[oidx]   -= v[0]*s1  +  v[1]*s2;
559       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
560       v  += 4;
561     }
562     t[idx]   = s1;t[1+idx] = s2;
563     idx += 2;
564   }
565   /* backward solve the L^T */
566   for (i=n-1; i>=0; i--){
567     v    = aa + 4*diag[i] - 4;
568     vi   = aj + diag[i] - 1;
569     nz   = diag[i] - ai[i];
570     idt  = 2*i;
571     s1 = t[idt];  s2 = t[1+idt];
572     while (nz--) {
573       idx   = 2*(*vi--);
574       t[idx]   -=  v[0]*s1 +  v[1]*s2;
575       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
576       v -= 4;
577     }
578   }
579 
580   /* copy t into x according to permutation */
581   ii = 0;
582   for (i=0; i<n; i++) {
583     ir      = 2*r[i];
584     x[ir]   = t[ii];
585     x[ir+1] = t[ii+1];
586     ii += 2;
587   }
588 
589   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
590   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
591   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
592   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
593   PetscLogFlops(2*4*(a->nz) - 2*A->n);
594   PetscFunctionReturn(0);
595 }
596 
597 #undef __FUNCT__
598 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
599 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
600 {
601   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
602   IS              iscol=a->col,isrow=a->row;
603   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
604   int             *diag = a->diag,ii,ic,ir,oidx;
605   MatScalar       *aa=a->a,*v;
606   PetscScalar     s1,s2,s3,x1,x2,x3;
607   PetscScalar     *x,*b,*t;
608 
609   PetscFunctionBegin;
610   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
611   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
612   t  = a->solve_work;
613 
614   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
615   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
616 
617   /* copy the b into temp work space according to permutation */
618   ii = 0;
619   for (i=0; i<n; i++) {
620     ic      = 3*c[i];
621     t[ii]   = b[ic];
622     t[ii+1] = b[ic+1];
623     t[ii+2] = b[ic+2];
624     ii += 3;
625   }
626 
627   /* forward solve the U^T */
628   idx = 0;
629   for (i=0; i<n; i++) {
630 
631     v     = aa + 9*diag[i];
632     /* multiply by the inverse of the block diagonal */
633     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
634     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
635     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
636     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
637     v += 9;
638 
639     vi    = aj + diag[i] + 1;
640     nz    = ai[i+1] - diag[i] - 1;
641     while (nz--) {
642       oidx = 3*(*vi++);
643       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
644       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
645       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
646       v  += 9;
647     }
648     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
649     idx += 3;
650   }
651   /* backward solve the L^T */
652   for (i=n-1; i>=0; i--){
653     v    = aa + 9*diag[i] - 9;
654     vi   = aj + diag[i] - 1;
655     nz   = diag[i] - ai[i];
656     idt  = 3*i;
657     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
658     while (nz--) {
659       idx   = 3*(*vi--);
660       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
661       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
662       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663       v -= 9;
664     }
665   }
666 
667   /* copy t into x according to permutation */
668   ii = 0;
669   for (i=0; i<n; i++) {
670     ir      = 3*r[i];
671     x[ir]   = t[ii];
672     x[ir+1] = t[ii+1];
673     x[ir+2] = t[ii+2];
674     ii += 3;
675   }
676 
677   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
678   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
679   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
680   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
681   PetscLogFlops(2*9*(a->nz) - 3*A->n);
682   PetscFunctionReturn(0);
683 }
684 
685 #undef __FUNCT__
686 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
687 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
688 {
689   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
690   IS              iscol=a->col,isrow=a->row;
691   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
692   int             *diag = a->diag,ii,ic,ir,oidx;
693   MatScalar       *aa=a->a,*v;
694   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
695   PetscScalar     *x,*b,*t;
696 
697   PetscFunctionBegin;
698   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
699   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
700   t  = a->solve_work;
701 
702   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
703   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
704 
705   /* copy the b into temp work space according to permutation */
706   ii = 0;
707   for (i=0; i<n; i++) {
708     ic      = 4*c[i];
709     t[ii]   = b[ic];
710     t[ii+1] = b[ic+1];
711     t[ii+2] = b[ic+2];
712     t[ii+3] = b[ic+3];
713     ii += 4;
714   }
715 
716   /* forward solve the U^T */
717   idx = 0;
718   for (i=0; i<n; i++) {
719 
720     v     = aa + 16*diag[i];
721     /* multiply by the inverse of the block diagonal */
722     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
723     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
724     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
725     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
726     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
727     v += 16;
728 
729     vi    = aj + diag[i] + 1;
730     nz    = ai[i+1] - diag[i] - 1;
731     while (nz--) {
732       oidx = 4*(*vi++);
733       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
734       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
735       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
736       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
737       v  += 16;
738     }
739     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
740     idx += 4;
741   }
742   /* backward solve the L^T */
743   for (i=n-1; i>=0; i--){
744     v    = aa + 16*diag[i] - 16;
745     vi   = aj + diag[i] - 1;
746     nz   = diag[i] - ai[i];
747     idt  = 4*i;
748     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
749     while (nz--) {
750       idx   = 4*(*vi--);
751       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
752       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
753       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
754       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
755       v -= 16;
756     }
757   }
758 
759   /* copy t into x according to permutation */
760   ii = 0;
761   for (i=0; i<n; i++) {
762     ir      = 4*r[i];
763     x[ir]   = t[ii];
764     x[ir+1] = t[ii+1];
765     x[ir+2] = t[ii+2];
766     x[ir+3] = t[ii+3];
767     ii += 4;
768   }
769 
770   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
771   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
772   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
773   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
774   PetscLogFlops(2*16*(a->nz) - 4*A->n);
775   PetscFunctionReturn(0);
776 }
777 
778 #undef __FUNCT__
779 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
780 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
781 {
782   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
783   IS              iscol=a->col,isrow=a->row;
784   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
785   int             *diag = a->diag,ii,ic,ir,oidx;
786   MatScalar       *aa=a->a,*v;
787   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
788   PetscScalar     *x,*b,*t;
789 
790   PetscFunctionBegin;
791   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
793   t  = a->solve_work;
794 
795   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
796   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
797 
798   /* copy the b into temp work space according to permutation */
799   ii = 0;
800   for (i=0; i<n; i++) {
801     ic      = 5*c[i];
802     t[ii]   = b[ic];
803     t[ii+1] = b[ic+1];
804     t[ii+2] = b[ic+2];
805     t[ii+3] = b[ic+3];
806     t[ii+4] = b[ic+4];
807     ii += 5;
808   }
809 
810   /* forward solve the U^T */
811   idx = 0;
812   for (i=0; i<n; i++) {
813 
814     v     = aa + 25*diag[i];
815     /* multiply by the inverse of the block diagonal */
816     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
817     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
818     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
819     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
820     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
821     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
822     v += 25;
823 
824     vi    = aj + diag[i] + 1;
825     nz    = ai[i+1] - diag[i] - 1;
826     while (nz--) {
827       oidx = 5*(*vi++);
828       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
829       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
830       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
831       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
832       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
833       v  += 25;
834     }
835     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
836     idx += 5;
837   }
838   /* backward solve the L^T */
839   for (i=n-1; i>=0; i--){
840     v    = aa + 25*diag[i] - 25;
841     vi   = aj + diag[i] - 1;
842     nz   = diag[i] - ai[i];
843     idt  = 5*i;
844     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
845     while (nz--) {
846       idx   = 5*(*vi--);
847       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
848       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
849       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852       v -= 25;
853     }
854   }
855 
856   /* copy t into x according to permutation */
857   ii = 0;
858   for (i=0; i<n; i++) {
859     ir      = 5*r[i];
860     x[ir]   = t[ii];
861     x[ir+1] = t[ii+1];
862     x[ir+2] = t[ii+2];
863     x[ir+3] = t[ii+3];
864     x[ir+4] = t[ii+4];
865     ii += 5;
866   }
867 
868   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
869   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
870   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
871   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
872   PetscLogFlops(2*25*(a->nz) - 5*A->n);
873   PetscFunctionReturn(0);
874 }
875 
876 #undef __FUNCT__
877 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
878 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
879 {
880   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
881   IS              iscol=a->col,isrow=a->row;
882   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
883   int             *diag = a->diag,ii,ic,ir,oidx;
884   MatScalar       *aa=a->a,*v;
885   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
886   PetscScalar     *x,*b,*t;
887 
888   PetscFunctionBegin;
889   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
890   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
891   t  = a->solve_work;
892 
893   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
894   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
895 
896   /* copy the b into temp work space according to permutation */
897   ii = 0;
898   for (i=0; i<n; i++) {
899     ic      = 6*c[i];
900     t[ii]   = b[ic];
901     t[ii+1] = b[ic+1];
902     t[ii+2] = b[ic+2];
903     t[ii+3] = b[ic+3];
904     t[ii+4] = b[ic+4];
905     t[ii+5] = b[ic+5];
906     ii += 6;
907   }
908 
909   /* forward solve the U^T */
910   idx = 0;
911   for (i=0; i<n; i++) {
912 
913     v     = aa + 36*diag[i];
914     /* multiply by the inverse of the block diagonal */
915     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
916     x6    = t[5+idx];
917     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
918     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
919     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
920     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
921     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
922     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
923     v += 36;
924 
925     vi    = aj + diag[i] + 1;
926     nz    = ai[i+1] - diag[i] - 1;
927     while (nz--) {
928       oidx = 6*(*vi++);
929       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
930       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
931       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
932       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
933       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
934       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
935       v  += 36;
936     }
937     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
938     t[5+idx] = s6;
939     idx += 6;
940   }
941   /* backward solve the L^T */
942   for (i=n-1; i>=0; i--){
943     v    = aa + 36*diag[i] - 36;
944     vi   = aj + diag[i] - 1;
945     nz   = diag[i] - ai[i];
946     idt  = 6*i;
947     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
948     s6 = t[5+idt];
949     while (nz--) {
950       idx   = 6*(*vi--);
951       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
952       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
953       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
954       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
955       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
956       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
957       v -= 36;
958     }
959   }
960 
961   /* copy t into x according to permutation */
962   ii = 0;
963   for (i=0; i<n; i++) {
964     ir      = 6*r[i];
965     x[ir]   = t[ii];
966     x[ir+1] = t[ii+1];
967     x[ir+2] = t[ii+2];
968     x[ir+3] = t[ii+3];
969     x[ir+4] = t[ii+4];
970     x[ir+5] = t[ii+5];
971     ii += 6;
972   }
973 
974   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
975   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
976   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
977   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
978   PetscLogFlops(2*36*(a->nz) - 6*A->n);
979   PetscFunctionReturn(0);
980 }
981 
982 #undef __FUNCT__
983 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
984 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
985 {
986   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
987   IS              iscol=a->col,isrow=a->row;
988   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
989   int             *diag = a->diag,ii,ic,ir,oidx;
990   MatScalar       *aa=a->a,*v;
991   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
992   PetscScalar     *x,*b,*t;
993 
994   PetscFunctionBegin;
995   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
996   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
997   t  = a->solve_work;
998 
999   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1000   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1001 
1002   /* copy the b into temp work space according to permutation */
1003   ii = 0;
1004   for (i=0; i<n; i++) {
1005     ic      = 7*c[i];
1006     t[ii]   = b[ic];
1007     t[ii+1] = b[ic+1];
1008     t[ii+2] = b[ic+2];
1009     t[ii+3] = b[ic+3];
1010     t[ii+4] = b[ic+4];
1011     t[ii+5] = b[ic+5];
1012     t[ii+6] = b[ic+6];
1013     ii += 7;
1014   }
1015 
1016   /* forward solve the U^T */
1017   idx = 0;
1018   for (i=0; i<n; i++) {
1019 
1020     v     = aa + 49*diag[i];
1021     /* multiply by the inverse of the block diagonal */
1022     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1023     x6    = t[5+idx]; x7 = t[6+idx];
1024     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1025     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1026     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1027     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1028     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1029     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1030     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1031     v += 49;
1032 
1033     vi    = aj + diag[i] + 1;
1034     nz    = ai[i+1] - diag[i] - 1;
1035     while (nz--) {
1036       oidx = 7*(*vi++);
1037       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1038       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1039       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1040       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1041       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1042       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1043       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1044       v  += 49;
1045     }
1046     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1047     t[5+idx] = s6;t[6+idx] = s7;
1048     idx += 7;
1049   }
1050   /* backward solve the L^T */
1051   for (i=n-1; i>=0; i--){
1052     v    = aa + 49*diag[i] - 49;
1053     vi   = aj + diag[i] - 1;
1054     nz   = diag[i] - ai[i];
1055     idt  = 7*i;
1056     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1057     s6 = t[5+idt];s7 = t[6+idt];
1058     while (nz--) {
1059       idx   = 7*(*vi--);
1060       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1061       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067       v -= 49;
1068     }
1069   }
1070 
1071   /* copy t into x according to permutation */
1072   ii = 0;
1073   for (i=0; i<n; i++) {
1074     ir      = 7*r[i];
1075     x[ir]   = t[ii];
1076     x[ir+1] = t[ii+1];
1077     x[ir+2] = t[ii+2];
1078     x[ir+3] = t[ii+3];
1079     x[ir+4] = t[ii+4];
1080     x[ir+5] = t[ii+5];
1081     x[ir+6] = t[ii+6];
1082     ii += 7;
1083   }
1084 
1085   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1086   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1087   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1088   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1089   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1090   PetscFunctionReturn(0);
1091 }
1092 
1093 /* ----------------------------------------------------------- */
1094 #undef __FUNCT__
1095 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1096 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1097 {
1098   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1099   IS              iscol=a->col,isrow=a->row;
1100   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1101   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1102   MatScalar       *aa=a->a,*v;
1103   PetscScalar     *x,*b,*s,*t,*ls;
1104 
1105   PetscFunctionBegin;
1106   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1107   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1108   t  = a->solve_work;
1109 
1110   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1111   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1112 
1113   /* forward solve the lower triangular */
1114   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1115   for (i=1; i<n; i++) {
1116     v   = aa + bs2*ai[i];
1117     vi  = aj + ai[i];
1118     nz  = a->diag[i] - ai[i];
1119     s = t + bs*i;
1120     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1121     while (nz--) {
1122       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1123       v += bs2;
1124     }
1125   }
1126   /* backward solve the upper triangular */
1127   ls = a->solve_work + A->n;
1128   for (i=n-1; i>=0; i--){
1129     v   = aa + bs2*(a->diag[i] + 1);
1130     vi  = aj + a->diag[i] + 1;
1131     nz  = ai[i+1] - a->diag[i] - 1;
1132     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1133     while (nz--) {
1134       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1135       v += bs2;
1136     }
1137     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1138     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1139   }
1140 
1141   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1142   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1143   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1144   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1145   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1146   PetscFunctionReturn(0);
1147 }
1148 
1149 #undef __FUNCT__
1150 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1151 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1152 {
1153   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1154   IS              iscol=a->col,isrow=a->row;
1155   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1156   int             *diag = a->diag;
1157   MatScalar       *aa=a->a,*v;
1158   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1159   PetscScalar     *x,*b,*t;
1160 
1161   PetscFunctionBegin;
1162   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1163   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1164   t  = a->solve_work;
1165 
1166   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1167   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1168 
1169   /* forward solve the lower triangular */
1170   idx    = 7*(*r++);
1171   t[0] = b[idx];   t[1] = b[1+idx];
1172   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1173   t[5] = b[5+idx]; t[6] = b[6+idx];
1174 
1175   for (i=1; i<n; i++) {
1176     v     = aa + 49*ai[i];
1177     vi    = aj + ai[i];
1178     nz    = diag[i] - ai[i];
1179     idx   = 7*(*r++);
1180     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1181     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1182     while (nz--) {
1183       idx   = 7*(*vi++);
1184       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1185       x4    = t[3+idx];x5 = t[4+idx];
1186       x6    = t[5+idx];x7 = t[6+idx];
1187       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1188       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1189       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1190       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1191       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1192       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1193       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1194       v += 49;
1195     }
1196     idx = 7*i;
1197     t[idx]   = s1;t[1+idx] = s2;
1198     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1199     t[5+idx] = s6;t[6+idx] = s7;
1200   }
1201   /* backward solve the upper triangular */
1202   for (i=n-1; i>=0; i--){
1203     v    = aa + 49*diag[i] + 49;
1204     vi   = aj + diag[i] + 1;
1205     nz   = ai[i+1] - diag[i] - 1;
1206     idt  = 7*i;
1207     s1 = t[idt];  s2 = t[1+idt];
1208     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1209     s6 = t[5+idt];s7 = t[6+idt];
1210     while (nz--) {
1211       idx   = 7*(*vi++);
1212       x1    = t[idx];   x2 = t[1+idx];
1213       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1214       x6    = t[5+idx]; x7 = t[6+idx];
1215       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1222       v += 49;
1223     }
1224     idc = 7*(*c--);
1225     v   = aa + 49*diag[i];
1226     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1227                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1228     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1229                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1230     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1231                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1232     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1233                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1234     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1235                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1236     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1237                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1238     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1239                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1240   }
1241 
1242   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1243   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1244   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1245   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1246   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1247   PetscFunctionReturn(0);
1248 }
1249 
1250 #undef __FUNCT__
1251 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1252 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1253 {
1254   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1255   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1256   int             ierr,*diag = a->diag,jdx;
1257   MatScalar       *aa=a->a,*v;
1258   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1259 
1260   PetscFunctionBegin;
1261   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1262   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1263   /* forward solve the lower triangular */
1264   idx    = 0;
1265   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1266   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1267   x[6] = b[6+idx];
1268   for (i=1; i<n; i++) {
1269     v     =  aa + 49*ai[i];
1270     vi    =  aj + ai[i];
1271     nz    =  diag[i] - ai[i];
1272     idx   =  7*i;
1273     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1274     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1275     s7  =  b[6+idx];
1276     while (nz--) {
1277       jdx   = 7*(*vi++);
1278       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1279       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1280       x7    = x[6+jdx];
1281       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1282       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1283       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1284       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1285       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1286       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1287       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1288       v += 49;
1289      }
1290     x[idx]   = s1;
1291     x[1+idx] = s2;
1292     x[2+idx] = s3;
1293     x[3+idx] = s4;
1294     x[4+idx] = s5;
1295     x[5+idx] = s6;
1296     x[6+idx] = s7;
1297   }
1298   /* backward solve the upper triangular */
1299   for (i=n-1; i>=0; i--){
1300     v    = aa + 49*diag[i] + 49;
1301     vi   = aj + diag[i] + 1;
1302     nz   = ai[i+1] - diag[i] - 1;
1303     idt  = 7*i;
1304     s1 = x[idt];   s2 = x[1+idt];
1305     s3 = x[2+idt]; s4 = x[3+idt];
1306     s5 = x[4+idt]; s6 = x[5+idt];
1307     s7 = x[6+idt];
1308     while (nz--) {
1309       idx   = 7*(*vi++);
1310       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1311       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1312       x7    = x[6+idx];
1313       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1314       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1315       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1316       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1317       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1318       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1319       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1320       v += 49;
1321     }
1322     v        = aa + 49*diag[i];
1323     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1324                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1325     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1326                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1327     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1328                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1329     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1330                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1331     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1332                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1333     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1334                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1335     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1336                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1337   }
1338 
1339   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1340   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1341   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1342   PetscFunctionReturn(0);
1343 }
1344 
1345 #undef __FUNCT__
1346 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1347 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1348 {
1349   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1350   IS              iscol=a->col,isrow=a->row;
1351   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1352   int             *diag = a->diag;
1353   MatScalar       *aa=a->a,*v;
1354   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1355 
1356   PetscFunctionBegin;
1357   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1358   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1359   t  = a->solve_work;
1360 
1361   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1362   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1363 
1364   /* forward solve the lower triangular */
1365   idx    = 6*(*r++);
1366   t[0] = b[idx];   t[1] = b[1+idx];
1367   t[2] = b[2+idx]; t[3] = b[3+idx];
1368   t[4] = b[4+idx]; t[5] = b[5+idx];
1369   for (i=1; i<n; i++) {
1370     v     = aa + 36*ai[i];
1371     vi    = aj + ai[i];
1372     nz    = diag[i] - ai[i];
1373     idx   = 6*(*r++);
1374     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1375     s5  = b[4+idx]; s6 = b[5+idx];
1376     while (nz--) {
1377       idx   = 6*(*vi++);
1378       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1379       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1380       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1381       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1382       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1383       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1384       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1385       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1386       v += 36;
1387     }
1388     idx = 6*i;
1389     t[idx]   = s1;t[1+idx] = s2;
1390     t[2+idx] = s3;t[3+idx] = s4;
1391     t[4+idx] = s5;t[5+idx] = s6;
1392   }
1393   /* backward solve the upper triangular */
1394   for (i=n-1; i>=0; i--){
1395     v    = aa + 36*diag[i] + 36;
1396     vi   = aj + diag[i] + 1;
1397     nz   = ai[i+1] - diag[i] - 1;
1398     idt  = 6*i;
1399     s1 = t[idt];  s2 = t[1+idt];
1400     s3 = t[2+idt];s4 = t[3+idt];
1401     s5 = t[4+idt];s6 = t[5+idt];
1402     while (nz--) {
1403       idx   = 6*(*vi++);
1404       x1    = t[idx];   x2 = t[1+idx];
1405       x3    = t[2+idx]; x4 = t[3+idx];
1406       x5    = t[4+idx]; x6 = t[5+idx];
1407       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1408       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1409       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1410       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1411       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1412       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1413       v += 36;
1414     }
1415     idc = 6*(*c--);
1416     v   = aa + 36*diag[i];
1417     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1418                                  v[18]*s4+v[24]*s5+v[30]*s6;
1419     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1420                                  v[19]*s4+v[25]*s5+v[31]*s6;
1421     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1422                                  v[20]*s4+v[26]*s5+v[32]*s6;
1423     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1424                                  v[21]*s4+v[27]*s5+v[33]*s6;
1425     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1426                                  v[22]*s4+v[28]*s5+v[34]*s6;
1427     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1428                                  v[23]*s4+v[29]*s5+v[35]*s6;
1429   }
1430 
1431   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1432   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1433   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1434   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1435   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1436   PetscFunctionReturn(0);
1437 }
1438 
1439 #undef __FUNCT__
1440 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1441 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1442 {
1443   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1444   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1445   int             ierr,*diag = a->diag,jdx;
1446   MatScalar       *aa=a->a,*v;
1447   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1448 
1449   PetscFunctionBegin;
1450   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1451   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1452   /* forward solve the lower triangular */
1453   idx    = 0;
1454   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1455   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1456   for (i=1; i<n; i++) {
1457     v     =  aa + 36*ai[i];
1458     vi    =  aj + ai[i];
1459     nz    =  diag[i] - ai[i];
1460     idx   =  6*i;
1461     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1462     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1463     while (nz--) {
1464       jdx   = 6*(*vi++);
1465       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1466       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1467       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1468       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1469       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1470       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1471       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1472       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1473       v += 36;
1474      }
1475     x[idx]   = s1;
1476     x[1+idx] = s2;
1477     x[2+idx] = s3;
1478     x[3+idx] = s4;
1479     x[4+idx] = s5;
1480     x[5+idx] = s6;
1481   }
1482   /* backward solve the upper triangular */
1483   for (i=n-1; i>=0; i--){
1484     v    = aa + 36*diag[i] + 36;
1485     vi   = aj + diag[i] + 1;
1486     nz   = ai[i+1] - diag[i] - 1;
1487     idt  = 6*i;
1488     s1 = x[idt];   s2 = x[1+idt];
1489     s3 = x[2+idt]; s4 = x[3+idt];
1490     s5 = x[4+idt]; s6 = x[5+idt];
1491     while (nz--) {
1492       idx   = 6*(*vi++);
1493       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1494       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1495       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1496       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1497       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1498       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1499       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1500       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1501       v += 36;
1502     }
1503     v        = aa + 36*diag[i];
1504     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1505     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1506     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1507     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1508     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1509     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1510   }
1511 
1512   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1513   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1514   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1515   PetscFunctionReturn(0);
1516 }
1517 
1518 #undef __FUNCT__
1519 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1520 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1521 {
1522   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1523   IS              iscol=a->col,isrow=a->row;
1524   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1525   int             *diag = a->diag;
1526   MatScalar       *aa=a->a,*v;
1527   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1528 
1529   PetscFunctionBegin;
1530   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1531   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1532   t  = a->solve_work;
1533 
1534   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1535   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1536 
1537   /* forward solve the lower triangular */
1538   idx    = 5*(*r++);
1539   t[0] = b[idx];   t[1] = b[1+idx];
1540   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1541   for (i=1; i<n; i++) {
1542     v     = aa + 25*ai[i];
1543     vi    = aj + ai[i];
1544     nz    = diag[i] - ai[i];
1545     idx   = 5*(*r++);
1546     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1547     s5  = b[4+idx];
1548     while (nz--) {
1549       idx   = 5*(*vi++);
1550       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1551       x4    = t[3+idx];x5 = t[4+idx];
1552       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1553       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1554       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1555       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1556       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1557       v += 25;
1558     }
1559     idx = 5*i;
1560     t[idx]   = s1;t[1+idx] = s2;
1561     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1562   }
1563   /* backward solve the upper triangular */
1564   for (i=n-1; i>=0; i--){
1565     v    = aa + 25*diag[i] + 25;
1566     vi   = aj + diag[i] + 1;
1567     nz   = ai[i+1] - diag[i] - 1;
1568     idt  = 5*i;
1569     s1 = t[idt];  s2 = t[1+idt];
1570     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1571     while (nz--) {
1572       idx   = 5*(*vi++);
1573       x1    = t[idx];   x2 = t[1+idx];
1574       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1575       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1576       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1577       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1578       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1579       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1580       v += 25;
1581     }
1582     idc = 5*(*c--);
1583     v   = aa + 25*diag[i];
1584     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1585                                  v[15]*s4+v[20]*s5;
1586     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1587                                  v[16]*s4+v[21]*s5;
1588     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1589                                  v[17]*s4+v[22]*s5;
1590     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1591                                  v[18]*s4+v[23]*s5;
1592     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1593                                  v[19]*s4+v[24]*s5;
1594   }
1595 
1596   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1597   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1598   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1599   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1600   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1601   PetscFunctionReturn(0);
1602 }
1603 
1604 #undef __FUNCT__
1605 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1606 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1607 {
1608   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1609   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1610   int             ierr,*diag = a->diag,jdx;
1611   MatScalar       *aa=a->a,*v;
1612   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1613 
1614   PetscFunctionBegin;
1615   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1616   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1617   /* forward solve the lower triangular */
1618   idx    = 0;
1619   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1620   for (i=1; i<n; i++) {
1621     v     =  aa + 25*ai[i];
1622     vi    =  aj + ai[i];
1623     nz    =  diag[i] - ai[i];
1624     idx   =  5*i;
1625     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1626     while (nz--) {
1627       jdx   = 5*(*vi++);
1628       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1629       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1630       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1631       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1632       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1633       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1634       v    += 25;
1635     }
1636     x[idx]   = s1;
1637     x[1+idx] = s2;
1638     x[2+idx] = s3;
1639     x[3+idx] = s4;
1640     x[4+idx] = s5;
1641   }
1642   /* backward solve the upper triangular */
1643   for (i=n-1; i>=0; i--){
1644     v    = aa + 25*diag[i] + 25;
1645     vi   = aj + diag[i] + 1;
1646     nz   = ai[i+1] - diag[i] - 1;
1647     idt  = 5*i;
1648     s1 = x[idt];  s2 = x[1+idt];
1649     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1650     while (nz--) {
1651       idx   = 5*(*vi++);
1652       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1653       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1654       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1655       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1656       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1657       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1658       v    += 25;
1659     }
1660     v        = aa + 25*diag[i];
1661     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1662     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1663     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1664     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1665     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1666   }
1667 
1668   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1669   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1670   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1671   PetscFunctionReturn(0);
1672 }
1673 
1674 #undef __FUNCT__
1675 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1676 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1677 {
1678   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1679   IS              iscol=a->col,isrow=a->row;
1680   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1681   int             *diag = a->diag;
1682   MatScalar       *aa=a->a,*v;
1683   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1684 
1685   PetscFunctionBegin;
1686   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1687   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1688   t  = a->solve_work;
1689 
1690   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1691   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1692 
1693   /* forward solve the lower triangular */
1694   idx    = 4*(*r++);
1695   t[0] = b[idx];   t[1] = b[1+idx];
1696   t[2] = b[2+idx]; t[3] = b[3+idx];
1697   for (i=1; i<n; i++) {
1698     v     = aa + 16*ai[i];
1699     vi    = aj + ai[i];
1700     nz    = diag[i] - ai[i];
1701     idx   = 4*(*r++);
1702     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1703     while (nz--) {
1704       idx   = 4*(*vi++);
1705       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1706       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1707       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1708       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1709       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1710       v    += 16;
1711     }
1712     idx        = 4*i;
1713     t[idx]   = s1;t[1+idx] = s2;
1714     t[2+idx] = s3;t[3+idx] = s4;
1715   }
1716   /* backward solve the upper triangular */
1717   for (i=n-1; i>=0; i--){
1718     v    = aa + 16*diag[i] + 16;
1719     vi   = aj + diag[i] + 1;
1720     nz   = ai[i+1] - diag[i] - 1;
1721     idt  = 4*i;
1722     s1 = t[idt];  s2 = t[1+idt];
1723     s3 = t[2+idt];s4 = t[3+idt];
1724     while (nz--) {
1725       idx   = 4*(*vi++);
1726       x1    = t[idx];   x2 = t[1+idx];
1727       x3    = t[2+idx]; x4 = t[3+idx];
1728       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1729       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1730       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1731       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1732       v += 16;
1733     }
1734     idc      = 4*(*c--);
1735     v        = aa + 16*diag[i];
1736     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1737     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1738     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1739     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1740   }
1741 
1742   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1743   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1744   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1745   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1746   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1747   PetscFunctionReturn(0);
1748 }
1749 
1750 #undef __FUNCT__
1751 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1752 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1753 {
1754   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1755   IS              iscol=a->col,isrow=a->row;
1756   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1757   int             *diag = a->diag;
1758   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1759   PetscScalar     *x,*b;
1760 
1761   PetscFunctionBegin;
1762   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1764   t  = (MatScalar *)a->solve_work;
1765 
1766   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1767   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1768 
1769   /* forward solve the lower triangular */
1770   idx    = 4*(*r++);
1771   t[0] = (MatScalar)b[idx];
1772   t[1] = (MatScalar)b[1+idx];
1773   t[2] = (MatScalar)b[2+idx];
1774   t[3] = (MatScalar)b[3+idx];
1775   for (i=1; i<n; i++) {
1776     v     = aa + 16*ai[i];
1777     vi    = aj + ai[i];
1778     nz    = diag[i] - ai[i];
1779     idx   = 4*(*r++);
1780     s1 = (MatScalar)b[idx];
1781     s2 = (MatScalar)b[1+idx];
1782     s3 = (MatScalar)b[2+idx];
1783     s4 = (MatScalar)b[3+idx];
1784     while (nz--) {
1785       idx   = 4*(*vi++);
1786       x1  = t[idx];
1787       x2  = t[1+idx];
1788       x3  = t[2+idx];
1789       x4  = t[3+idx];
1790       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1791       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1792       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1793       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1794       v    += 16;
1795     }
1796     idx        = 4*i;
1797     t[idx]   = s1;
1798     t[1+idx] = s2;
1799     t[2+idx] = s3;
1800     t[3+idx] = s4;
1801   }
1802   /* backward solve the upper triangular */
1803   for (i=n-1; i>=0; i--){
1804     v    = aa + 16*diag[i] + 16;
1805     vi   = aj + diag[i] + 1;
1806     nz   = ai[i+1] - diag[i] - 1;
1807     idt  = 4*i;
1808     s1 = t[idt];
1809     s2 = t[1+idt];
1810     s3 = t[2+idt];
1811     s4 = t[3+idt];
1812     while (nz--) {
1813       idx   = 4*(*vi++);
1814       x1  = t[idx];
1815       x2  = t[1+idx];
1816       x3  = t[2+idx];
1817       x4  = t[3+idx];
1818       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1819       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1820       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1821       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1822       v += 16;
1823     }
1824     idc      = 4*(*c--);
1825     v        = aa + 16*diag[i];
1826     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1827     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1828     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1829     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1830     x[idc]   = (PetscScalar)t[idt];
1831     x[1+idc] = (PetscScalar)t[1+idt];
1832     x[2+idc] = (PetscScalar)t[2+idt];
1833     x[3+idc] = (PetscScalar)t[3+idt];
1834  }
1835 
1836   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1837   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1838   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1839   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1840   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1841   PetscFunctionReturn(0);
1842 }
1843 
1844 #if defined (PETSC_HAVE_SSE)
1845 
1846 #include PETSC_HAVE_SSE
1847 
1848 #undef __FUNCT__
1849 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1850 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1851 {
1852   /*
1853      Note: This code uses demotion of double
1854      to float when performing the mixed-mode computation.
1855      This may not be numerically reasonable for all applications.
1856   */
1857   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1858   IS              iscol=a->col,isrow=a->row;
1859   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1860   int             *diag = a->diag,ai16;
1861   MatScalar       *aa=a->a,*v;
1862   PetscScalar     *x,*b,*t;
1863 
1864   /* Make space in temp stack for 16 Byte Aligned arrays */
1865   float           ssealignedspace[11],*tmps,*tmpx;
1866   unsigned long   offset;
1867 
1868   PetscFunctionBegin;
1869   SSE_SCOPE_BEGIN;
1870 
1871     offset = (unsigned long)ssealignedspace % 16;
1872     if (offset) offset = (16 - offset)/4;
1873     tmps = &ssealignedspace[offset];
1874     tmpx = &ssealignedspace[offset+4];
1875     PREFETCH_NTA(aa+16*ai[1]);
1876 
1877     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1878     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1879     t  = a->solve_work;
1880 
1881     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1882     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1883 
1884     /* forward solve the lower triangular */
1885     idx  = 4*(*r++);
1886     t[0] = b[idx];   t[1] = b[1+idx];
1887     t[2] = b[2+idx]; t[3] = b[3+idx];
1888     v    =  aa + 16*ai[1];
1889 
1890     for (i=1; i<n;) {
1891       PREFETCH_NTA(&v[8]);
1892       vi   =  aj      + ai[i];
1893       nz   =  diag[i] - ai[i];
1894       idx  =  4*(*r++);
1895 
1896       /* Demote sum from double to float */
1897       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1898       LOAD_PS(tmps,XMM7);
1899 
1900       while (nz--) {
1901         PREFETCH_NTA(&v[16]);
1902         idx = 4*(*vi++);
1903 
1904         /* Demote solution (so far) from double to float */
1905         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1906 
1907         /* 4x4 Matrix-Vector product with negative accumulation: */
1908         SSE_INLINE_BEGIN_2(tmpx,v)
1909           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1910 
1911           /* First Column */
1912           SSE_COPY_PS(XMM0,XMM6)
1913           SSE_SHUFFLE(XMM0,XMM0,0x00)
1914           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1915           SSE_SUB_PS(XMM7,XMM0)
1916 
1917           /* Second Column */
1918           SSE_COPY_PS(XMM1,XMM6)
1919           SSE_SHUFFLE(XMM1,XMM1,0x55)
1920           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1921           SSE_SUB_PS(XMM7,XMM1)
1922 
1923           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1924 
1925           /* Third Column */
1926           SSE_COPY_PS(XMM2,XMM6)
1927           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1928           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1929           SSE_SUB_PS(XMM7,XMM2)
1930 
1931           /* Fourth Column */
1932           SSE_COPY_PS(XMM3,XMM6)
1933           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1934           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1935           SSE_SUB_PS(XMM7,XMM3)
1936         SSE_INLINE_END_2
1937 
1938         v  += 16;
1939       }
1940       idx = 4*i;
1941       v   = aa + 16*ai[++i];
1942       PREFETCH_NTA(v);
1943       STORE_PS(tmps,XMM7);
1944 
1945       /* Promote result from float to double */
1946       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1947     }
1948     /* backward solve the upper triangular */
1949     idt  = 4*(n-1);
1950     ai16 = 16*diag[n-1];
1951     v    = aa + ai16 + 16;
1952     for (i=n-1; i>=0;){
1953       PREFETCH_NTA(&v[8]);
1954       vi = aj + diag[i] + 1;
1955       nz = ai[i+1] - diag[i] - 1;
1956 
1957       /* Demote accumulator from double to float */
1958       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1959       LOAD_PS(tmps,XMM7);
1960 
1961       while (nz--) {
1962         PREFETCH_NTA(&v[16]);
1963         idx = 4*(*vi++);
1964 
1965         /* Demote solution (so far) from double to float */
1966         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1967 
1968         /* 4x4 Matrix-Vector Product with negative accumulation: */
1969         SSE_INLINE_BEGIN_2(tmpx,v)
1970           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1971 
1972           /* First Column */
1973           SSE_COPY_PS(XMM0,XMM6)
1974           SSE_SHUFFLE(XMM0,XMM0,0x00)
1975           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1976           SSE_SUB_PS(XMM7,XMM0)
1977 
1978           /* Second Column */
1979           SSE_COPY_PS(XMM1,XMM6)
1980           SSE_SHUFFLE(XMM1,XMM1,0x55)
1981           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1982           SSE_SUB_PS(XMM7,XMM1)
1983 
1984           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1985 
1986           /* Third Column */
1987           SSE_COPY_PS(XMM2,XMM6)
1988           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1989           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1990           SSE_SUB_PS(XMM7,XMM2)
1991 
1992           /* Fourth Column */
1993           SSE_COPY_PS(XMM3,XMM6)
1994           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1995           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1996           SSE_SUB_PS(XMM7,XMM3)
1997         SSE_INLINE_END_2
1998         v  += 16;
1999       }
2000       v    = aa + ai16;
2001       ai16 = 16*diag[--i];
2002       PREFETCH_NTA(aa+ai16+16);
2003       /*
2004          Scale the result by the diagonal 4x4 block,
2005          which was inverted as part of the factorization
2006       */
2007       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2008         /* First Column */
2009         SSE_COPY_PS(XMM0,XMM7)
2010         SSE_SHUFFLE(XMM0,XMM0,0x00)
2011         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2012 
2013         /* Second Column */
2014         SSE_COPY_PS(XMM1,XMM7)
2015         SSE_SHUFFLE(XMM1,XMM1,0x55)
2016         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2017         SSE_ADD_PS(XMM0,XMM1)
2018 
2019         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2020 
2021         /* Third Column */
2022         SSE_COPY_PS(XMM2,XMM7)
2023         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2024         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2025         SSE_ADD_PS(XMM0,XMM2)
2026 
2027         /* Fourth Column */
2028         SSE_COPY_PS(XMM3,XMM7)
2029         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2030         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2031         SSE_ADD_PS(XMM0,XMM3)
2032 
2033         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2034       SSE_INLINE_END_3
2035 
2036       /* Promote solution from float to double */
2037       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2038 
2039       /* Apply reordering to t and stream into x.    */
2040       /* This way, x doesn't pollute the cache.      */
2041       /* Be careful with size: 2 doubles = 4 floats! */
2042       idc  = 4*(*c--);
2043       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2044         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2045         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2046         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2047         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2048         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2049         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2050       SSE_INLINE_END_2
2051       v    = aa + ai16 + 16;
2052       idt -= 4;
2053     }
2054 
2055     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2056     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2057     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2058     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2059     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2060   SSE_SCOPE_END;
2061   PetscFunctionReturn(0);
2062 }
2063 
2064 #endif
2065 
2066 
2067 /*
2068       Special case where the matrix was ILU(0) factored in the natural
2069    ordering. This eliminates the need for the column and row permutation.
2070 */
2071 #undef __FUNCT__
2072 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2073 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2074 {
2075   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2076   int             n=a->mbs,*ai=a->i,*aj=a->j;
2077   int             ierr,*diag = a->diag;
2078   MatScalar       *aa=a->a;
2079   PetscScalar     *x,*b;
2080 
2081   PetscFunctionBegin;
2082   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2083   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2084 
2085 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2086   {
2087     static PetscScalar w[2000]; /* very BAD need to fix */
2088     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2089   }
2090 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2091   {
2092     static PetscScalar w[2000]; /* very BAD need to fix */
2093     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2094   }
2095 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2096   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2097 #else
2098   {
2099     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2100     MatScalar    *v;
2101     int          jdx,idt,idx,nz,*vi,i,ai16;
2102 
2103   /* forward solve the lower triangular */
2104   idx    = 0;
2105   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2106   for (i=1; i<n; i++) {
2107     v     =  aa      + 16*ai[i];
2108     vi    =  aj      + ai[i];
2109     nz    =  diag[i] - ai[i];
2110     idx   +=  4;
2111     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2112     while (nz--) {
2113       jdx   = 4*(*vi++);
2114       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2115       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2116       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2117       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2118       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2119       v    += 16;
2120     }
2121     x[idx]   = s1;
2122     x[1+idx] = s2;
2123     x[2+idx] = s3;
2124     x[3+idx] = s4;
2125   }
2126   /* backward solve the upper triangular */
2127   idt = 4*(n-1);
2128   for (i=n-1; i>=0; i--){
2129     ai16 = 16*diag[i];
2130     v    = aa + ai16 + 16;
2131     vi   = aj + diag[i] + 1;
2132     nz   = ai[i+1] - diag[i] - 1;
2133     s1 = x[idt];  s2 = x[1+idt];
2134     s3 = x[2+idt];s4 = x[3+idt];
2135     while (nz--) {
2136       idx   = 4*(*vi++);
2137       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2138       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2139       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2140       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2141       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2142       v    += 16;
2143     }
2144     v        = aa + ai16;
2145     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2146     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2147     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2148     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2149     idt -= 4;
2150   }
2151   }
2152 #endif
2153 
2154   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2155   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2156   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2157   PetscFunctionReturn(0);
2158 }
2159 
2160 #undef __FUNCT__
2161 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2162 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2163 {
2164   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2165   int             n=a->mbs,*ai=a->i,*aj=a->j;
2166   int             ierr,*diag = a->diag;
2167   MatScalar       *aa=a->a;
2168   PetscScalar     *x,*b;
2169 
2170   PetscFunctionBegin;
2171   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2172   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2173 
2174   {
2175     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2176     MatScalar  *v,*t=(MatScalar *)x;
2177     int        jdx,idt,idx,nz,*vi,i,ai16;
2178 
2179     /* forward solve the lower triangular */
2180     idx  = 0;
2181     t[0] = (MatScalar)b[0];
2182     t[1] = (MatScalar)b[1];
2183     t[2] = (MatScalar)b[2];
2184     t[3] = (MatScalar)b[3];
2185     for (i=1; i<n; i++) {
2186       v     =  aa      + 16*ai[i];
2187       vi    =  aj      + ai[i];
2188       nz    =  diag[i] - ai[i];
2189       idx   +=  4;
2190       s1 = (MatScalar)b[idx];
2191       s2 = (MatScalar)b[1+idx];
2192       s3 = (MatScalar)b[2+idx];
2193       s4 = (MatScalar)b[3+idx];
2194       while (nz--) {
2195         jdx = 4*(*vi++);
2196         x1  = t[jdx];
2197         x2  = t[1+jdx];
2198         x3  = t[2+jdx];
2199         x4  = t[3+jdx];
2200         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2201         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2202         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2203         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2204         v    += 16;
2205       }
2206       t[idx]   = s1;
2207       t[1+idx] = s2;
2208       t[2+idx] = s3;
2209       t[3+idx] = s4;
2210     }
2211     /* backward solve the upper triangular */
2212     idt = 4*(n-1);
2213     for (i=n-1; i>=0; i--){
2214       ai16 = 16*diag[i];
2215       v    = aa + ai16 + 16;
2216       vi   = aj + diag[i] + 1;
2217       nz   = ai[i+1] - diag[i] - 1;
2218       s1   = t[idt];
2219       s2   = t[1+idt];
2220       s3   = t[2+idt];
2221       s4   = t[3+idt];
2222       while (nz--) {
2223         idx = 4*(*vi++);
2224         x1  = (MatScalar)x[idx];
2225         x2  = (MatScalar)x[1+idx];
2226         x3  = (MatScalar)x[2+idx];
2227         x4  = (MatScalar)x[3+idx];
2228         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2229         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2230         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2231         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2232         v    += 16;
2233       }
2234       v        = aa + ai16;
2235       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2236       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2237       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2238       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2239       idt -= 4;
2240     }
2241   }
2242 
2243   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2244   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2245   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2246   PetscFunctionReturn(0);
2247 }
2248 
2249 #if defined (PETSC_HAVE_SSE)
2250 
2251 #include PETSC_HAVE_SSE
2252 #undef __FUNCT__
2253 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2254 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2255 {
2256   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2257   unsigned short *aj=(unsigned short *)a->j;
2258   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2259   MatScalar      *aa=a->a;
2260   PetscScalar    *x,*b;
2261 
2262   PetscFunctionBegin;
2263   SSE_SCOPE_BEGIN;
2264   /*
2265      Note: This code currently uses demotion of double
2266      to float when performing the mixed-mode computation.
2267      This may not be numerically reasonable for all applications.
2268   */
2269   PREFETCH_NTA(aa+16*ai[1]);
2270 
2271   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2272   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2273   {
2274     /* x will first be computed in single precision then promoted inplace to double */
2275     MatScalar      *v,*t=(MatScalar *)x;
2276     int            nz,i,idt,ai16;
2277     unsigned int   jdx,idx;
2278     unsigned short *vi;
2279     /* Forward solve the lower triangular factor. */
2280 
2281     /* First block is the identity. */
2282     idx  = 0;
2283     CONVERT_DOUBLE4_FLOAT4(t,b);
2284     v    =  aa + 16*((unsigned int)ai[1]);
2285 
2286     for (i=1; i<n;) {
2287       PREFETCH_NTA(&v[8]);
2288       vi   =  aj      + ai[i];
2289       nz   =  diag[i] - ai[i];
2290       idx +=  4;
2291 
2292       /* Demote RHS from double to float. */
2293       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2294       LOAD_PS(&t[idx],XMM7);
2295 
2296       while (nz--) {
2297         PREFETCH_NTA(&v[16]);
2298         jdx = 4*((unsigned int)(*vi++));
2299 
2300         /* 4x4 Matrix-Vector product with negative accumulation: */
2301         SSE_INLINE_BEGIN_2(&t[jdx],v)
2302           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2303 
2304           /* First Column */
2305           SSE_COPY_PS(XMM0,XMM6)
2306           SSE_SHUFFLE(XMM0,XMM0,0x00)
2307           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2308           SSE_SUB_PS(XMM7,XMM0)
2309 
2310           /* Second Column */
2311           SSE_COPY_PS(XMM1,XMM6)
2312           SSE_SHUFFLE(XMM1,XMM1,0x55)
2313           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2314           SSE_SUB_PS(XMM7,XMM1)
2315 
2316           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2317 
2318           /* Third Column */
2319           SSE_COPY_PS(XMM2,XMM6)
2320           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2321           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2322           SSE_SUB_PS(XMM7,XMM2)
2323 
2324           /* Fourth Column */
2325           SSE_COPY_PS(XMM3,XMM6)
2326           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2327           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2328           SSE_SUB_PS(XMM7,XMM3)
2329         SSE_INLINE_END_2
2330 
2331         v  += 16;
2332       }
2333       v    =  aa + 16*ai[++i];
2334       PREFETCH_NTA(v);
2335       STORE_PS(&t[idx],XMM7);
2336     }
2337 
2338     /* Backward solve the upper triangular factor.*/
2339 
2340     idt  = 4*(n-1);
2341     ai16 = 16*diag[n-1];
2342     v    = aa + ai16 + 16;
2343     for (i=n-1; i>=0;){
2344       PREFETCH_NTA(&v[8]);
2345       vi = aj + diag[i] + 1;
2346       nz = ai[i+1] - diag[i] - 1;
2347 
2348       LOAD_PS(&t[idt],XMM7);
2349 
2350       while (nz--) {
2351         PREFETCH_NTA(&v[16]);
2352         idx = 4*((unsigned int)(*vi++));
2353 
2354         /* 4x4 Matrix-Vector Product with negative accumulation: */
2355         SSE_INLINE_BEGIN_2(&t[idx],v)
2356           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2357 
2358           /* First Column */
2359           SSE_COPY_PS(XMM0,XMM6)
2360           SSE_SHUFFLE(XMM0,XMM0,0x00)
2361           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2362           SSE_SUB_PS(XMM7,XMM0)
2363 
2364           /* Second Column */
2365           SSE_COPY_PS(XMM1,XMM6)
2366           SSE_SHUFFLE(XMM1,XMM1,0x55)
2367           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2368           SSE_SUB_PS(XMM7,XMM1)
2369 
2370           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2371 
2372           /* Third Column */
2373           SSE_COPY_PS(XMM2,XMM6)
2374           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2375           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2376           SSE_SUB_PS(XMM7,XMM2)
2377 
2378           /* Fourth Column */
2379           SSE_COPY_PS(XMM3,XMM6)
2380           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2381           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2382           SSE_SUB_PS(XMM7,XMM3)
2383         SSE_INLINE_END_2
2384         v  += 16;
2385       }
2386       v    = aa + ai16;
2387       ai16 = 16*diag[--i];
2388       PREFETCH_NTA(aa+ai16+16);
2389       /*
2390          Scale the result by the diagonal 4x4 block,
2391          which was inverted as part of the factorization
2392       */
2393       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2394         /* First Column */
2395         SSE_COPY_PS(XMM0,XMM7)
2396         SSE_SHUFFLE(XMM0,XMM0,0x00)
2397         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2398 
2399         /* Second Column */
2400         SSE_COPY_PS(XMM1,XMM7)
2401         SSE_SHUFFLE(XMM1,XMM1,0x55)
2402         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2403         SSE_ADD_PS(XMM0,XMM1)
2404 
2405         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2406 
2407         /* Third Column */
2408         SSE_COPY_PS(XMM2,XMM7)
2409         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2410         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2411         SSE_ADD_PS(XMM0,XMM2)
2412 
2413         /* Fourth Column */
2414         SSE_COPY_PS(XMM3,XMM7)
2415         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2416         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2417         SSE_ADD_PS(XMM0,XMM3)
2418 
2419         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2420       SSE_INLINE_END_3
2421 
2422       v    = aa + ai16 + 16;
2423       idt -= 4;
2424     }
2425 
2426     /* Convert t from single precision back to double precision (inplace)*/
2427     idt = 4*(n-1);
2428     for (i=n-1;i>=0;i--) {
2429       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2430       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2431       PetscScalar *xtemp=&x[idt];
2432       MatScalar   *ttemp=&t[idt];
2433       xtemp[3] = (PetscScalar)ttemp[3];
2434       xtemp[2] = (PetscScalar)ttemp[2];
2435       xtemp[1] = (PetscScalar)ttemp[1];
2436       xtemp[0] = (PetscScalar)ttemp[0];
2437       idt -= 4;
2438     }
2439 
2440   } /* End of artificial scope. */
2441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2443   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2444   SSE_SCOPE_END;
2445   PetscFunctionReturn(0);
2446 }
2447 
2448 #undef __FUNCT__
2449 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2450 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2451 {
2452   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2453   int            *aj=a->j;
2454   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2455   MatScalar      *aa=a->a;
2456   PetscScalar    *x,*b;
2457 
2458   PetscFunctionBegin;
2459   SSE_SCOPE_BEGIN;
2460   /*
2461      Note: This code currently uses demotion of double
2462      to float when performing the mixed-mode computation.
2463      This may not be numerically reasonable for all applications.
2464   */
2465   PREFETCH_NTA(aa+16*ai[1]);
2466 
2467   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2468   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2469   {
2470     /* x will first be computed in single precision then promoted inplace to double */
2471     MatScalar *v,*t=(MatScalar *)x;
2472     int       nz,i,idt,ai16;
2473     int       jdx,idx;
2474     int       *vi;
2475     /* Forward solve the lower triangular factor. */
2476 
2477     /* First block is the identity. */
2478     idx  = 0;
2479     CONVERT_DOUBLE4_FLOAT4(t,b);
2480     v    =  aa + 16*ai[1];
2481 
2482     for (i=1; i<n;) {
2483       PREFETCH_NTA(&v[8]);
2484       vi   =  aj      + ai[i];
2485       nz   =  diag[i] - ai[i];
2486       idx +=  4;
2487 
2488       /* Demote RHS from double to float. */
2489       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2490       LOAD_PS(&t[idx],XMM7);
2491 
2492       while (nz--) {
2493         PREFETCH_NTA(&v[16]);
2494         jdx = 4*(*vi++);
2495 /*          jdx = *vi++; */
2496 
2497         /* 4x4 Matrix-Vector product with negative accumulation: */
2498         SSE_INLINE_BEGIN_2(&t[jdx],v)
2499           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2500 
2501           /* First Column */
2502           SSE_COPY_PS(XMM0,XMM6)
2503           SSE_SHUFFLE(XMM0,XMM0,0x00)
2504           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2505           SSE_SUB_PS(XMM7,XMM0)
2506 
2507           /* Second Column */
2508           SSE_COPY_PS(XMM1,XMM6)
2509           SSE_SHUFFLE(XMM1,XMM1,0x55)
2510           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2511           SSE_SUB_PS(XMM7,XMM1)
2512 
2513           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2514 
2515           /* Third Column */
2516           SSE_COPY_PS(XMM2,XMM6)
2517           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2518           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2519           SSE_SUB_PS(XMM7,XMM2)
2520 
2521           /* Fourth Column */
2522           SSE_COPY_PS(XMM3,XMM6)
2523           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2524           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2525           SSE_SUB_PS(XMM7,XMM3)
2526         SSE_INLINE_END_2
2527 
2528         v  += 16;
2529       }
2530       v    =  aa + 16*ai[++i];
2531       PREFETCH_NTA(v);
2532       STORE_PS(&t[idx],XMM7);
2533     }
2534 
2535     /* Backward solve the upper triangular factor.*/
2536 
2537     idt  = 4*(n-1);
2538     ai16 = 16*diag[n-1];
2539     v    = aa + ai16 + 16;
2540     for (i=n-1; i>=0;){
2541       PREFETCH_NTA(&v[8]);
2542       vi = aj + diag[i] + 1;
2543       nz = ai[i+1] - diag[i] - 1;
2544 
2545       LOAD_PS(&t[idt],XMM7);
2546 
2547       while (nz--) {
2548         PREFETCH_NTA(&v[16]);
2549         idx = 4*(*vi++);
2550 /*          idx = *vi++; */
2551 
2552         /* 4x4 Matrix-Vector Product with negative accumulation: */
2553         SSE_INLINE_BEGIN_2(&t[idx],v)
2554           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2555 
2556           /* First Column */
2557           SSE_COPY_PS(XMM0,XMM6)
2558           SSE_SHUFFLE(XMM0,XMM0,0x00)
2559           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2560           SSE_SUB_PS(XMM7,XMM0)
2561 
2562           /* Second Column */
2563           SSE_COPY_PS(XMM1,XMM6)
2564           SSE_SHUFFLE(XMM1,XMM1,0x55)
2565           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2566           SSE_SUB_PS(XMM7,XMM1)
2567 
2568           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2569 
2570           /* Third Column */
2571           SSE_COPY_PS(XMM2,XMM6)
2572           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2573           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2574           SSE_SUB_PS(XMM7,XMM2)
2575 
2576           /* Fourth Column */
2577           SSE_COPY_PS(XMM3,XMM6)
2578           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2579           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2580           SSE_SUB_PS(XMM7,XMM3)
2581         SSE_INLINE_END_2
2582         v  += 16;
2583       }
2584       v    = aa + ai16;
2585       ai16 = 16*diag[--i];
2586       PREFETCH_NTA(aa+ai16+16);
2587       /*
2588          Scale the result by the diagonal 4x4 block,
2589          which was inverted as part of the factorization
2590       */
2591       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2592         /* First Column */
2593         SSE_COPY_PS(XMM0,XMM7)
2594         SSE_SHUFFLE(XMM0,XMM0,0x00)
2595         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2596 
2597         /* Second Column */
2598         SSE_COPY_PS(XMM1,XMM7)
2599         SSE_SHUFFLE(XMM1,XMM1,0x55)
2600         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2601         SSE_ADD_PS(XMM0,XMM1)
2602 
2603         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2604 
2605         /* Third Column */
2606         SSE_COPY_PS(XMM2,XMM7)
2607         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2608         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2609         SSE_ADD_PS(XMM0,XMM2)
2610 
2611         /* Fourth Column */
2612         SSE_COPY_PS(XMM3,XMM7)
2613         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2614         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2615         SSE_ADD_PS(XMM0,XMM3)
2616 
2617         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2618       SSE_INLINE_END_3
2619 
2620       v    = aa + ai16 + 16;
2621       idt -= 4;
2622     }
2623 
2624     /* Convert t from single precision back to double precision (inplace)*/
2625     idt = 4*(n-1);
2626     for (i=n-1;i>=0;i--) {
2627       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2628       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2629       PetscScalar *xtemp=&x[idt];
2630       MatScalar   *ttemp=&t[idt];
2631       xtemp[3] = (PetscScalar)ttemp[3];
2632       xtemp[2] = (PetscScalar)ttemp[2];
2633       xtemp[1] = (PetscScalar)ttemp[1];
2634       xtemp[0] = (PetscScalar)ttemp[0];
2635       idt -= 4;
2636     }
2637 
2638   } /* End of artificial scope. */
2639   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2640   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2641   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2642   SSE_SCOPE_END;
2643   PetscFunctionReturn(0);
2644 }
2645 
2646 #endif
2647 #undef __FUNCT__
2648 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2649 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2650 {
2651   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2652   IS              iscol=a->col,isrow=a->row;
2653   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2654   int             *diag = a->diag;
2655   MatScalar       *aa=a->a,*v;
2656   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
2657 
2658   PetscFunctionBegin;
2659   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2660   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2661   t  = a->solve_work;
2662 
2663   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2664   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2665 
2666   /* forward solve the lower triangular */
2667   idx    = 3*(*r++);
2668   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2669   for (i=1; i<n; i++) {
2670     v     = aa + 9*ai[i];
2671     vi    = aj + ai[i];
2672     nz    = diag[i] - ai[i];
2673     idx   = 3*(*r++);
2674     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2675     while (nz--) {
2676       idx   = 3*(*vi++);
2677       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2678       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2679       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2680       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2681       v += 9;
2682     }
2683     idx = 3*i;
2684     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2685   }
2686   /* backward solve the upper triangular */
2687   for (i=n-1; i>=0; i--){
2688     v    = aa + 9*diag[i] + 9;
2689     vi   = aj + diag[i] + 1;
2690     nz   = ai[i+1] - diag[i] - 1;
2691     idt  = 3*i;
2692     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2693     while (nz--) {
2694       idx   = 3*(*vi++);
2695       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2696       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2697       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2698       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2699       v += 9;
2700     }
2701     idc = 3*(*c--);
2702     v   = aa + 9*diag[i];
2703     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2704     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2705     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2706   }
2707   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2708   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2709   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2711   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2712   PetscFunctionReturn(0);
2713 }
2714 
2715 /*
2716       Special case where the matrix was ILU(0) factored in the natural
2717    ordering. This eliminates the need for the column and row permutation.
2718 */
2719 #undef __FUNCT__
2720 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2721 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2722 {
2723   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2724   int             n=a->mbs,*ai=a->i,*aj=a->j;
2725   int             ierr,*diag = a->diag;
2726   MatScalar       *aa=a->a,*v;
2727   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2728   int             jdx,idt,idx,nz,*vi,i;
2729 
2730   PetscFunctionBegin;
2731   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2732   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2733 
2734 
2735   /* forward solve the lower triangular */
2736   idx    = 0;
2737   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2738   for (i=1; i<n; i++) {
2739     v     =  aa      + 9*ai[i];
2740     vi    =  aj      + ai[i];
2741     nz    =  diag[i] - ai[i];
2742     idx   +=  3;
2743     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2744     while (nz--) {
2745       jdx   = 3*(*vi++);
2746       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2747       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2748       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2749       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2750       v    += 9;
2751     }
2752     x[idx]   = s1;
2753     x[1+idx] = s2;
2754     x[2+idx] = s3;
2755   }
2756   /* backward solve the upper triangular */
2757   for (i=n-1; i>=0; i--){
2758     v    = aa + 9*diag[i] + 9;
2759     vi   = aj + diag[i] + 1;
2760     nz   = ai[i+1] - diag[i] - 1;
2761     idt  = 3*i;
2762     s1 = x[idt];  s2 = x[1+idt];
2763     s3 = x[2+idt];
2764     while (nz--) {
2765       idx   = 3*(*vi++);
2766       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2767       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2768       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2769       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2770       v    += 9;
2771     }
2772     v        = aa +  9*diag[i];
2773     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2774     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2775     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2776   }
2777 
2778   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2779   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2780   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2781   PetscFunctionReturn(0);
2782 }
2783 
2784 #undef __FUNCT__
2785 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2786 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2787 {
2788   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2789   IS              iscol=a->col,isrow=a->row;
2790   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2791   int             *diag = a->diag;
2792   MatScalar       *aa=a->a,*v;
2793   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
2794 
2795   PetscFunctionBegin;
2796   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2797   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2798   t  = a->solve_work;
2799 
2800   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2801   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2802 
2803   /* forward solve the lower triangular */
2804   idx    = 2*(*r++);
2805   t[0] = b[idx]; t[1] = b[1+idx];
2806   for (i=1; i<n; i++) {
2807     v     = aa + 4*ai[i];
2808     vi    = aj + ai[i];
2809     nz    = diag[i] - ai[i];
2810     idx   = 2*(*r++);
2811     s1  = b[idx]; s2 = b[1+idx];
2812     while (nz--) {
2813       idx   = 2*(*vi++);
2814       x1    = t[idx]; x2 = t[1+idx];
2815       s1 -= v[0]*x1 + v[2]*x2;
2816       s2 -= v[1]*x1 + v[3]*x2;
2817       v += 4;
2818     }
2819     idx = 2*i;
2820     t[idx] = s1; t[1+idx] = s2;
2821   }
2822   /* backward solve the upper triangular */
2823   for (i=n-1; i>=0; i--){
2824     v    = aa + 4*diag[i] + 4;
2825     vi   = aj + diag[i] + 1;
2826     nz   = ai[i+1] - diag[i] - 1;
2827     idt  = 2*i;
2828     s1 = t[idt]; s2 = t[1+idt];
2829     while (nz--) {
2830       idx   = 2*(*vi++);
2831       x1    = t[idx]; x2 = t[1+idx];
2832       s1 -= v[0]*x1 + v[2]*x2;
2833       s2 -= v[1]*x1 + v[3]*x2;
2834       v += 4;
2835     }
2836     idc = 2*(*c--);
2837     v   = aa + 4*diag[i];
2838     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2839     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2840   }
2841   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2842   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2843   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2844   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2845   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2846   PetscFunctionReturn(0);
2847 }
2848 
2849 /*
2850       Special case where the matrix was ILU(0) factored in the natural
2851    ordering. This eliminates the need for the column and row permutation.
2852 */
2853 #undef __FUNCT__
2854 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2855 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2856 {
2857   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2858   int             n=a->mbs,*ai=a->i,*aj=a->j;
2859   int             ierr,*diag = a->diag;
2860   MatScalar       *aa=a->a,*v;
2861   PetscScalar     *x,*b,s1,s2,x1,x2;
2862   int             jdx,idt,idx,nz,*vi,i;
2863 
2864   PetscFunctionBegin;
2865   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2866   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2867 
2868   /* forward solve the lower triangular */
2869   idx    = 0;
2870   x[0]   = b[0]; x[1] = b[1];
2871   for (i=1; i<n; i++) {
2872     v     =  aa      + 4*ai[i];
2873     vi    =  aj      + ai[i];
2874     nz    =  diag[i] - ai[i];
2875     idx   +=  2;
2876     s1  =  b[idx];s2 = b[1+idx];
2877     while (nz--) {
2878       jdx   = 2*(*vi++);
2879       x1    = x[jdx];x2 = x[1+jdx];
2880       s1 -= v[0]*x1 + v[2]*x2;
2881       s2 -= v[1]*x1 + v[3]*x2;
2882       v    += 4;
2883     }
2884     x[idx]   = s1;
2885     x[1+idx] = s2;
2886   }
2887   /* backward solve the upper triangular */
2888   for (i=n-1; i>=0; i--){
2889     v    = aa + 4*diag[i] + 4;
2890     vi   = aj + diag[i] + 1;
2891     nz   = ai[i+1] - diag[i] - 1;
2892     idt  = 2*i;
2893     s1 = x[idt];  s2 = x[1+idt];
2894     while (nz--) {
2895       idx   = 2*(*vi++);
2896       x1    = x[idx];   x2 = x[1+idx];
2897       s1 -= v[0]*x1 + v[2]*x2;
2898       s2 -= v[1]*x1 + v[3]*x2;
2899       v    += 4;
2900     }
2901     v        = aa +  4*diag[i];
2902     x[idt]   = v[0]*s1 + v[2]*s2;
2903     x[1+idt] = v[1]*s1 + v[3]*s2;
2904   }
2905 
2906   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2907   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2908   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2909   PetscFunctionReturn(0);
2910 }
2911 
2912 #undef __FUNCT__
2913 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2914 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2915 {
2916   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2917   IS              iscol=a->col,isrow=a->row;
2918   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2919   int             *diag = a->diag;
2920   MatScalar       *aa=a->a,*v;
2921   PetscScalar     *x,*b,s1,*t;
2922 
2923   PetscFunctionBegin;
2924   if (!n) PetscFunctionReturn(0);
2925 
2926   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2927   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2928   t  = a->solve_work;
2929 
2930   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2931   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2932 
2933   /* forward solve the lower triangular */
2934   t[0] = b[*r++];
2935   for (i=1; i<n; i++) {
2936     v     = aa + ai[i];
2937     vi    = aj + ai[i];
2938     nz    = diag[i] - ai[i];
2939     s1  = b[*r++];
2940     while (nz--) {
2941       s1 -= (*v++)*t[*vi++];
2942     }
2943     t[i] = s1;
2944   }
2945   /* backward solve the upper triangular */
2946   for (i=n-1; i>=0; i--){
2947     v    = aa + diag[i] + 1;
2948     vi   = aj + diag[i] + 1;
2949     nz   = ai[i+1] - diag[i] - 1;
2950     s1 = t[i];
2951     while (nz--) {
2952       s1 -= (*v++)*t[*vi++];
2953     }
2954     x[*c--] = t[i] = aa[diag[i]]*s1;
2955   }
2956 
2957   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2958   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2959   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2960   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2961   PetscLogFlops(2*1*(a->nz) - A->n);
2962   PetscFunctionReturn(0);
2963 }
2964 /*
2965       Special case where the matrix was ILU(0) factored in the natural
2966    ordering. This eliminates the need for the column and row permutation.
2967 */
2968 #undef __FUNCT__
2969 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2970 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2971 {
2972   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2973   int             n=a->mbs,*ai=a->i,*aj=a->j;
2974   int             ierr,*diag = a->diag;
2975   MatScalar       *aa=a->a;
2976   PetscScalar     *x,*b;
2977   PetscScalar     s1,x1;
2978   MatScalar       *v;
2979   int             jdx,idt,idx,nz,*vi,i;
2980 
2981   PetscFunctionBegin;
2982   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2983   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2984 
2985   /* forward solve the lower triangular */
2986   idx    = 0;
2987   x[0]   = b[0];
2988   for (i=1; i<n; i++) {
2989     v     =  aa      + ai[i];
2990     vi    =  aj      + ai[i];
2991     nz    =  diag[i] - ai[i];
2992     idx   +=  1;
2993     s1  =  b[idx];
2994     while (nz--) {
2995       jdx   = *vi++;
2996       x1    = x[jdx];
2997       s1 -= v[0]*x1;
2998       v    += 1;
2999     }
3000     x[idx]   = s1;
3001   }
3002   /* backward solve the upper triangular */
3003   for (i=n-1; i>=0; i--){
3004     v    = aa + diag[i] + 1;
3005     vi   = aj + diag[i] + 1;
3006     nz   = ai[i+1] - diag[i] - 1;
3007     idt  = i;
3008     s1 = x[idt];
3009     while (nz--) {
3010       idx   = *vi++;
3011       x1    = x[idx];
3012       s1 -= v[0]*x1;
3013       v    += 1;
3014     }
3015     v        = aa +  diag[i];
3016     x[idt]   = v[0]*s1;
3017   }
3018   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3019   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3020   PetscLogFlops(2*(a->nz) - A->n);
3021   PetscFunctionReturn(0);
3022 }
3023 
3024 /* ----------------------------------------------------------------*/
3025 /*
3026      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3027    except that the data structure of Mat_SeqAIJ is slightly different.
3028    Not a good example of code reuse.
3029 */
3030 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
3031 
3032 #undef __FUNCT__
3033 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3034 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3035 {
3036   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3037   IS          isicol;
3038   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3039   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3040   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3041   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3042   PetscTruth  col_identity,row_identity;
3043   PetscReal   f;
3044 
3045   PetscFunctionBegin;
3046   f             = info->fill;
3047   levels        = (int)info->levels;
3048   diagonal_fill = (int)info->diagonal_fill;
3049   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3050   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3051   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3052 
3053   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3054     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3055     (*fact)->factor = FACTOR_LU;
3056     b               = (Mat_SeqBAIJ*)(*fact)->data;
3057     if (!b->diag) {
3058       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3059     }
3060     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3061     b->row        = isrow;
3062     b->col        = iscol;
3063     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3064     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3065     b->icol       = isicol;
3066     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3067     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3068   } else { /* general case perform the symbolic factorization */
3069     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3070     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3071 
3072     /* get new row pointers */
3073     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
3074     ainew[0] = 0;
3075     /* don't know how many column pointers are needed so estimate */
3076     jmax = (int)(f*ai[n] + 1);
3077     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
3078     /* ajfill is level of fill for each fill entry */
3079     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
3080     /* fill is a linked list of nonzeros in active row */
3081     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
3082     /* im is level for each filled value */
3083     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
3084     /* dloc is location of diagonal in factor */
3085     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
3086     dloc[0]  = 0;
3087     for (prow=0; prow<n; prow++) {
3088 
3089       /* copy prow into linked list */
3090       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3091       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3092       xi         = aj + ai[r[prow]];
3093       fill[n]    = n;
3094       fill[prow] = -1; /* marker for diagonal entry */
3095       while (nz--) {
3096 	fm  = n;
3097 	idx = ic[*xi++];
3098 	do {
3099 	  m  = fm;
3100 	  fm = fill[m];
3101 	} while (fm < idx);
3102 	fill[m]   = idx;
3103 	fill[idx] = fm;
3104 	im[idx]   = 0;
3105       }
3106 
3107       /* make sure diagonal entry is included */
3108       if (diagonal_fill && fill[prow] == -1) {
3109 	fm = n;
3110 	while (fill[fm] < prow) fm = fill[fm];
3111 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3112 	fill[fm]   = prow;
3113 	im[prow]   = 0;
3114 	nzf++;
3115 	dcount++;
3116       }
3117 
3118       nzi = 0;
3119       row = fill[n];
3120       while (row < prow) {
3121 	incrlev = im[row] + 1;
3122 	nz      = dloc[row];
3123 	xi      = ajnew  + ainew[row] + nz + 1;
3124 	flev    = ajfill + ainew[row] + nz + 1;
3125 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3126 	fm      = row;
3127 	while (nnz-- > 0) {
3128 	  idx = *xi++;
3129 	  if (*flev + incrlev > levels) {
3130 	    flev++;
3131 	    continue;
3132 	  }
3133 	  do {
3134 	    m  = fm;
3135 	    fm = fill[m];
3136 	  } while (fm < idx);
3137 	  if (fm != idx) {
3138 	    im[idx]   = *flev + incrlev;
3139 	    fill[m]   = idx;
3140 	    fill[idx] = fm;
3141 	    fm        = idx;
3142 	    nzf++;
3143 	  } else {
3144 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3145 	  }
3146 	  flev++;
3147 	}
3148 	row = fill[row];
3149 	nzi++;
3150       }
3151       /* copy new filled row into permanent storage */
3152       ainew[prow+1] = ainew[prow] + nzf;
3153       if (ainew[prow+1] > jmax) {
3154 
3155 	/* estimate how much additional space we will need */
3156 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3157 	/* just double the memory each time */
3158 	int maxadd = jmax;
3159 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3160 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3161 	jmax += maxadd;
3162 
3163 	/* allocate a longer ajnew and ajfill */
3164 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3165 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3166 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3167 	ajnew = xi;
3168 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3169 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3170 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3171 	ajfill = xi;
3172 	reallocate++; /* count how many reallocations are needed */
3173       }
3174       xi          = ajnew + ainew[prow];
3175       flev        = ajfill + ainew[prow];
3176       dloc[prow]  = nzi;
3177       fm          = fill[n];
3178       while (nzf--) {
3179 	*xi++   = fm;
3180 	*flev++ = im[fm];
3181 	fm      = fill[fm];
3182       }
3183       /* make sure row has diagonal entry */
3184       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3185 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3186     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3187       }
3188     }
3189     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3190     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3191     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3192     ierr = PetscFree(fill);CHKERRQ(ierr);
3193     ierr = PetscFree(im);CHKERRQ(ierr);
3194 
3195     {
3196       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3197       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3198       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3199       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3200       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3201       if (diagonal_fill) {
3202 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3203       }
3204     }
3205 
3206     /* put together the new matrix */
3207     ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr);
3208     ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr);
3209     ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr);
3210     PetscLogObjectParent(*fact,isicol);
3211     b = (Mat_SeqBAIJ*)(*fact)->data;
3212     ierr = PetscFree(b->imax);CHKERRQ(ierr);
3213     b->singlemalloc = PETSC_FALSE;
3214     len = bs2*ainew[n]*sizeof(MatScalar);
3215     /* the next line frees the default space generated by the Create() */
3216     ierr = PetscFree(b->a);CHKERRQ(ierr);
3217     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
3218     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
3219     b->j          = ajnew;
3220     b->i          = ainew;
3221     for (i=0; i<n; i++) dloc[i] += ainew[i];
3222     b->diag       = dloc;
3223     b->ilen       = 0;
3224     b->imax       = 0;
3225     b->row        = isrow;
3226     b->col        = iscol;
3227     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3228     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3229     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3230     b->icol       = isicol;
3231     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3232     /* In b structure:  Free imax, ilen, old a, old j.
3233        Allocate dloc, solve_work, new a, new j */
3234     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3235     b->maxnz          = b->nz = ainew[n];
3236     (*fact)->factor   = FACTOR_LU;
3237 
3238     (*fact)->info.factor_mallocs    = reallocate;
3239     (*fact)->info.fill_ratio_given  = f;
3240     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3241   }
3242 
3243   if (row_identity && col_identity) {
3244     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
3245   }
3246   PetscFunctionReturn(0);
3247 }
3248 
3249 #undef __FUNCT__
3250 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3251 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3252 {
3253   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3254   /* int i,*AJ=a->j,nz=a->nz; */
3255   PetscFunctionBegin;
3256   /* Undo Column scaling */
3257 /*    while (nz--) { */
3258 /*      AJ[i] = AJ[i]/4; */
3259 /*    } */
3260   /* This should really invoke a push/pop logic, but we don't have that yet. */
3261   A->ops->setunfactored = PETSC_NULL;
3262   PetscFunctionReturn(0);
3263 }
3264 
3265 #undef __FUNCT__
3266 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3267 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3268 {
3269   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3270   int *AJ=a->j,nz=a->nz;
3271   unsigned short *aj=(unsigned short *)AJ;
3272   PetscFunctionBegin;
3273   /* Is this really necessary? */
3274   while (nz--) {
3275     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3276   }
3277   A->ops->setunfactored = PETSC_NULL;
3278   PetscFunctionReturn(0);
3279 }
3280 
3281 #undef __FUNCT__
3282 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3283 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3284 {
3285   /*
3286       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3287       with natural ordering
3288   */
3289   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3290 
3291   PetscFunctionBegin;
3292   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3293   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3294   switch (a->bs) {
3295   case 1:
3296     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3297     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3298     break;
3299   case 2:
3300     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3301     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3302     break;
3303   case 3:
3304     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3305     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3306     break;
3307   case 4:
3308 #if defined(PETSC_USE_MAT_SINGLE)
3309     {
3310       PetscTruth  sse_enabled_local;
3311       int         ierr;
3312       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3313       if (sse_enabled_local) {
3314 #  if defined(PETSC_HAVE_SSE)
3315         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3316         if (n==(unsigned short)n) {
3317           unsigned short *aj=(unsigned short *)AJ;
3318           for (i=0;i<nz;i++) {
3319             aj[i] = (unsigned short)AJ[i];
3320           }
3321           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3322           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3323           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3324         } else {
3325         /* Scale the column indices for easier indexing in MatSolve. */
3326 /*            for (i=0;i<nz;i++) { */
3327 /*              AJ[i] = AJ[i]*4; */
3328 /*            } */
3329           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3330           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3331           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3332         }
3333 #  else
3334       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3335         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3336 #  endif
3337       } else {
3338         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3339         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3340       }
3341     }
3342 #else
3343     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3344     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3345 #endif
3346     break;
3347   case 5:
3348     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3349     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3350     break;
3351   case 6:
3352     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3353     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3354     break;
3355   case 7:
3356     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3357     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3358     break;
3359   }
3360   PetscFunctionReturn(0);
3361 }
3362 
3363 #undef __FUNCT__
3364 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3365 int MatSeqBAIJ_UpdateSolvers(Mat A)
3366 {
3367   /*
3368       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3369       with natural ordering
3370   */
3371   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3372   IS          row = a->row, col = a->col;
3373   PetscTruth  row_identity, col_identity;
3374   PetscTruth  use_natural;
3375   int         ierr;
3376 
3377   PetscFunctionBegin;
3378 
3379   use_natural = PETSC_FALSE;
3380   if (row && col) {
3381     ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3382     ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3383 
3384     if (row_identity && col_identity) {
3385       use_natural = PETSC_TRUE;
3386     }
3387   } else {
3388     use_natural = PETSC_TRUE;
3389   }
3390 
3391   switch (a->bs) {
3392   case 1:
3393     if (use_natural) {
3394       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3395       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3396       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3397       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3398     } else {
3399       A->ops->solve           = MatSolve_SeqBAIJ_1;
3400       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3401     }
3402     break;
3403   case 2:
3404     if (use_natural) {
3405       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3406       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3407       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3408       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3409     } else {
3410       A->ops->solve           = MatSolve_SeqBAIJ_2;
3411       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3412     }
3413     break;
3414   case 3:
3415     if (use_natural) {
3416       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3417       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3418       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3419       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3420     } else {
3421       A->ops->solve           = MatSolve_SeqBAIJ_3;
3422       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3423     }
3424     break;
3425   case 4:
3426     {
3427       PetscTruth sse_enabled_local;
3428       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3429       if (use_natural) {
3430 #if defined(PETSC_USE_MAT_SINGLE)
3431         if (sse_enabled_local) { /* Natural + Single + SSE */
3432 #  if defined(PETSC_HAVE_SSE)
3433           int n=a->mbs;
3434           if (n==(unsigned short)n) {
3435             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3436             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3437           } else {
3438             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3439             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3440           }
3441 #  else
3442           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3443           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3444 #  endif
3445         } else { /* Natural + Single */
3446           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3447           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3448         }
3449 #else
3450         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3451         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3452 #endif
3453         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3454         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3455       } else { /* Arbitrary ordering */
3456 #if defined(PETSC_USE_MAT_SINGLE)
3457         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3458 #  if defined(PETSC_HAVE_SSE)
3459           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3460           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3461 #  else
3462           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3463           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3464 #  endif
3465         } else { /* Arbitrary + Single */
3466           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3467           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3468         }
3469 #else
3470         A->ops->solve           = MatSolve_SeqBAIJ_4;
3471 #endif
3472         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3473       }
3474     }
3475     break;
3476   case 5:
3477     if (use_natural) {
3478       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3479       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3480       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3481       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3482     } else {
3483       A->ops->solve           = MatSolve_SeqBAIJ_5;
3484       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3485     }
3486     break;
3487   case 6:
3488     if (use_natural) {
3489       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3490       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3491       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3492       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3493     } else {
3494       A->ops->solve           = MatSolve_SeqBAIJ_6;
3495       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3496     }
3497     break;
3498   case 7:
3499     if (use_natural) {
3500       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3501       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3502       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3503       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3504     } else {
3505       A->ops->solve           = MatSolve_SeqBAIJ_7;
3506       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3507     }
3508     break;
3509   default:
3510     A->ops->solve             = MatSolve_SeqBAIJ_N;
3511     break;
3512   }
3513   PetscFunctionReturn(0);
3514 }
3515 
3516 #undef __FUNCT__
3517 #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3518 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3519   int ierr;
3520 
3521   PetscFunctionBegin;
3522   ierr = MatSeqBAIJ_UpdateSolvers(A);
3523   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3524     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3525   } else {
3526     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3527   }
3528   PetscFunctionReturn(0);
3529 }
3530 
3531 #undef __FUNCT__
3532 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3533 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3534   int ierr;
3535 
3536   PetscFunctionBegin;
3537   ierr = MatSeqBAIJ_UpdateSolvers(A);
3538   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3539   PetscFunctionReturn(0);
3540 }
3541 
3542 
3543