xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 7fa3a6a09039016628d7d5d5fddb58a44fcdf355)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124   PetscInt       *diag = a->diag,oidx;
125   MatScalar      *aa=a->a,*v;
126   PetscScalar    s1,s2,s3,x1,x2,x3;
127   PetscScalar    *x,*b;
128 
129   PetscFunctionBegin;
130   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133 
134   /* forward solve the U^T */
135   idx = 0;
136   for (i=0; i<n; i++) {
137 
138     v     = aa + 9*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144     v += 9;
145 
146     vi    = aj + diag[i] + 1;
147     nz    = ai[i+1] - diag[i] - 1;
148     while (nz--) {
149       oidx = 3*(*vi++);
150       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153       v  += 9;
154     }
155     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156     idx += 3;
157   }
158   /* backward solve the L^T */
159   for (i=n-1; i>=0; i--){
160     v    = aa + 9*diag[i] - 9;
161     vi   = aj + diag[i] - 1;
162     nz   = diag[i] - ai[i];
163     idt  = 3*i;
164     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165     while (nz--) {
166       idx   = 3*(*vi--);
167       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170       v -= 9;
171     }
172   }
173   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182 {
183   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184   PetscErrorCode ierr;
185   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186   PetscInt       *diag = a->diag,oidx;
187   MatScalar      *aa=a->a,*v;
188   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
189   PetscScalar    *x,*b;
190 
191   PetscFunctionBegin;
192   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
193   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195 
196   /* forward solve the U^T */
197   idx = 0;
198   for (i=0; i<n; i++) {
199 
200     v     = aa + 16*diag[i];
201     /* multiply by the inverse of the block diagonal */
202     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207     v += 16;
208 
209     vi    = aj + diag[i] + 1;
210     nz    = ai[i+1] - diag[i] - 1;
211     while (nz--) {
212       oidx = 4*(*vi++);
213       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217       v  += 16;
218     }
219     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220     idx += 4;
221   }
222   /* backward solve the L^T */
223   for (i=n-1; i>=0; i--){
224     v    = aa + 16*diag[i] - 16;
225     vi   = aj + diag[i] - 1;
226     nz   = diag[i] - ai[i];
227     idt  = 4*i;
228     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229     while (nz--) {
230       idx   = 4*(*vi--);
231       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235       v -= 16;
236     }
237   }
238   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
239   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247 {
248   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249   PetscErrorCode ierr;
250   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251   PetscInt       *diag = a->diag,oidx;
252   MatScalar      *aa=a->a,*v;
253   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
254   PetscScalar    *x,*b;
255 
256   PetscFunctionBegin;
257   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
258   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260 
261   /* forward solve the U^T */
262   idx = 0;
263   for (i=0; i<n; i++) {
264 
265     v     = aa + 25*diag[i];
266     /* multiply by the inverse of the block diagonal */
267     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273     v += 25;
274 
275     vi    = aj + diag[i] + 1;
276     nz    = ai[i+1] - diag[i] - 1;
277     while (nz--) {
278       oidx = 5*(*vi++);
279       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284       v  += 25;
285     }
286     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287     idx += 5;
288   }
289   /* backward solve the L^T */
290   for (i=n-1; i>=0; i--){
291     v    = aa + 25*diag[i] - 25;
292     vi   = aj + diag[i] - 1;
293     nz   = diag[i] - ai[i];
294     idt  = 5*i;
295     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296     while (nz--) {
297       idx   = 5*(*vi--);
298       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303       v -= 25;
304     }
305   }
306   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
307   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315 {
316   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317   PetscErrorCode ierr;
318   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319   PetscInt       *diag = a->diag,oidx;
320   MatScalar      *aa=a->a,*v;
321   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
322   PetscScalar    *x,*b;
323 
324   PetscFunctionBegin;
325   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
326   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
327   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328 
329   /* forward solve the U^T */
330   idx = 0;
331   for (i=0; i<n; i++) {
332 
333     v     = aa + 36*diag[i];
334     /* multiply by the inverse of the block diagonal */
335     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336     x6    = x[5+idx];
337     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343     v += 36;
344 
345     vi    = aj + diag[i] + 1;
346     nz    = ai[i+1] - diag[i] - 1;
347     while (nz--) {
348       oidx = 6*(*vi++);
349       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355       v  += 36;
356     }
357     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358     x[5+idx] = s6;
359     idx += 6;
360   }
361   /* backward solve the L^T */
362   for (i=n-1; i>=0; i--){
363     v    = aa + 36*diag[i] - 36;
364     vi   = aj + diag[i] - 1;
365     nz   = diag[i] - ai[i];
366     idt  = 6*i;
367     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368     s6 = x[5+idt];
369     while (nz--) {
370       idx   = 6*(*vi--);
371       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377       v -= 36;
378     }
379   }
380   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389 {
390   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391   PetscErrorCode ierr;
392   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393   PetscInt       *diag = a->diag,oidx;
394   MatScalar      *aa=a->a,*v;
395   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
396   PetscScalar    *x,*b;
397 
398   PetscFunctionBegin;
399   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
400   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
401   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402 
403   /* forward solve the U^T */
404   idx = 0;
405   for (i=0; i<n; i++) {
406 
407     v     = aa + 49*diag[i];
408     /* multiply by the inverse of the block diagonal */
409     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410     x6    = x[5+idx]; x7 = x[6+idx];
411     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418     v += 49;
419 
420     vi    = aj + diag[i] + 1;
421     nz    = ai[i+1] - diag[i] - 1;
422     while (nz--) {
423       oidx = 7*(*vi++);
424       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431       v  += 49;
432     }
433     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434     x[5+idx] = s6;x[6+idx] = s7;
435     idx += 7;
436   }
437   /* backward solve the L^T */
438   for (i=n-1; i>=0; i--){
439     v    = aa + 49*diag[i] - 49;
440     vi   = aj + diag[i] - 1;
441     nz   = diag[i] - ai[i];
442     idt  = 7*i;
443     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444     s6 = x[5+idt];s7 = x[6+idt];
445     while (nz--) {
446       idx   = 7*(*vi--);
447       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454       v -= 49;
455     }
456   }
457   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 /*---------------------------------------------------------------------------------------------*/
464 #undef __FUNCT__
465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469   IS             iscol=a->col,isrow=a->row;
470   PetscErrorCode ierr;
471   const PetscInt *r,*c,*rout,*cout;
472   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473   PetscInt       *diag = a->diag;
474   MatScalar      *aa=a->a,*v;
475   PetscScalar    s1,*x,*b,*t;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   t  = a->solve_work;
481 
482   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484 
485   /* copy the b into temp work space according to permutation */
486   for (i=0; i<n; i++) {
487     t[i] = b[c[i]];
488   }
489 
490   /* forward solve the U^T */
491   for (i=0; i<n; i++) {
492 
493     v     = aa + diag[i];
494     /* multiply by the inverse of the block diagonal */
495     s1    = (*v++)*t[i];
496     vi    = aj + diag[i] + 1;
497     nz    = ai[i+1] - diag[i] - 1;
498     while (nz--) {
499       t[*vi++]  -= (*v++)*s1;
500     }
501     t[i]   = s1;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--){
505     v    = aa + diag[i] - 1;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     s1   = t[i];
509     while (nz--) {
510       t[*vi--]   -=  (*v--)*s1;
511     }
512   }
513 
514   /* copy t into x according to permutation */
515   for (i=0; i<n; i++) {
516     x[r[i]]   = t[i];
517   }
518 
519   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
521   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524   PetscFunctionReturn(0);
525 }
526 
527 #undef __FUNCT__
528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530 {
531   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532   IS             iscol=a->col,isrow=a->row;
533   PetscErrorCode ierr;
534   const PetscInt *r,*c,*rout,*cout;
535   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537   MatScalar      *aa=a->a,*v;
538   PetscScalar    s1,s2,x1,x2;
539   PetscScalar    *x,*b,*t;
540 
541   PetscFunctionBegin;
542   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544   t  = a->solve_work;
545 
546   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548 
549   /* copy the b into temp work space according to permutation */
550   ii = 0;
551   for (i=0; i<n; i++) {
552     ic      = 2*c[i];
553     t[ii]   = b[ic];
554     t[ii+1] = b[ic+1];
555     ii += 2;
556   }
557 
558   /* forward solve the U^T */
559   idx = 0;
560   for (i=0; i<n; i++) {
561 
562     v     = aa + 4*diag[i];
563     /* multiply by the inverse of the block diagonal */
564     x1    = t[idx];   x2 = t[1+idx];
565     s1 = v[0]*x1  +  v[1]*x2;
566     s2 = v[2]*x1  +  v[3]*x2;
567     v += 4;
568 
569     vi    = aj + diag[i] + 1;
570     nz    = ai[i+1] - diag[i] - 1;
571     while (nz--) {
572       oidx = 2*(*vi++);
573       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575       v  += 4;
576     }
577     t[idx]   = s1;t[1+idx] = s2;
578     idx += 2;
579   }
580   /* backward solve the L^T */
581   for (i=n-1; i>=0; i--){
582     v    = aa + 4*diag[i] - 4;
583     vi   = aj + diag[i] - 1;
584     nz   = diag[i] - ai[i];
585     idt  = 2*i;
586     s1 = t[idt];  s2 = t[1+idt];
587     while (nz--) {
588       idx   = 2*(*vi--);
589       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591       v -= 4;
592     }
593   }
594 
595   /* copy t into x according to permutation */
596   ii = 0;
597   for (i=0; i<n; i++) {
598     ir      = 2*r[i];
599     x[ir]   = t[ii];
600     x[ir+1] = t[ii+1];
601     ii += 2;
602   }
603 
604   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
606   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615 {
616   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617   IS             iscol=a->col,isrow=a->row;
618   PetscErrorCode ierr;
619   const PetscInt *r,*c,*rout,*cout;
620   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622   MatScalar      *aa=a->a,*v;
623   PetscScalar    s1,s2,s3,x1,x2,x3;
624   PetscScalar    *x,*b,*t;
625 
626   PetscFunctionBegin;
627   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629   t  = a->solve_work;
630 
631   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633 
634   /* copy the b into temp work space according to permutation */
635   ii = 0;
636   for (i=0; i<n; i++) {
637     ic      = 3*c[i];
638     t[ii]   = b[ic];
639     t[ii+1] = b[ic+1];
640     t[ii+2] = b[ic+2];
641     ii += 3;
642   }
643 
644   /* forward solve the U^T */
645   idx = 0;
646   for (i=0; i<n; i++) {
647 
648     v     = aa + 9*diag[i];
649     /* multiply by the inverse of the block diagonal */
650     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654     v += 9;
655 
656     vi    = aj + diag[i] + 1;
657     nz    = ai[i+1] - diag[i] - 1;
658     while (nz--) {
659       oidx = 3*(*vi++);
660       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663       v  += 9;
664     }
665     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666     idx += 3;
667   }
668   /* backward solve the L^T */
669   for (i=n-1; i>=0; i--){
670     v    = aa + 9*diag[i] - 9;
671     vi   = aj + diag[i] - 1;
672     nz   = diag[i] - ai[i];
673     idt  = 3*i;
674     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675     while (nz--) {
676       idx   = 3*(*vi--);
677       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680       v -= 9;
681     }
682   }
683 
684   /* copy t into x according to permutation */
685   ii = 0;
686   for (i=0; i<n; i++) {
687     ir      = 3*r[i];
688     x[ir]   = t[ii];
689     x[ir+1] = t[ii+1];
690     x[ir+2] = t[ii+2];
691     ii += 3;
692   }
693 
694   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
696   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 #undef __FUNCT__
703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705 {
706   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707   IS             iscol=a->col,isrow=a->row;
708   PetscErrorCode ierr;
709   const PetscInt *r,*c,*rout,*cout;
710   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712   MatScalar      *aa=a->a,*v;
713   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
714   PetscScalar    *x,*b,*t;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   t  = a->solve_work;
720 
721   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723 
724   /* copy the b into temp work space according to permutation */
725   ii = 0;
726   for (i=0; i<n; i++) {
727     ic      = 4*c[i];
728     t[ii]   = b[ic];
729     t[ii+1] = b[ic+1];
730     t[ii+2] = b[ic+2];
731     t[ii+3] = b[ic+3];
732     ii += 4;
733   }
734 
735   /* forward solve the U^T */
736   idx = 0;
737   for (i=0; i<n; i++) {
738 
739     v     = aa + 16*diag[i];
740     /* multiply by the inverse of the block diagonal */
741     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746     v += 16;
747 
748     vi    = aj + diag[i] + 1;
749     nz    = ai[i+1] - diag[i] - 1;
750     while (nz--) {
751       oidx = 4*(*vi++);
752       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756       v  += 16;
757     }
758     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759     idx += 4;
760   }
761   /* backward solve the L^T */
762   for (i=n-1; i>=0; i--){
763     v    = aa + 16*diag[i] - 16;
764     vi   = aj + diag[i] - 1;
765     nz   = diag[i] - ai[i];
766     idt  = 4*i;
767     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768     while (nz--) {
769       idx   = 4*(*vi--);
770       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774       v -= 16;
775     }
776   }
777 
778   /* copy t into x according to permutation */
779   ii = 0;
780   for (i=0; i<n; i++) {
781     ir      = 4*r[i];
782     x[ir]   = t[ii];
783     x[ir+1] = t[ii+1];
784     x[ir+2] = t[ii+2];
785     x[ir+3] = t[ii+3];
786     ii += 4;
787   }
788 
789   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
791   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 #undef __FUNCT__
798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800 {
801   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802   IS             iscol=a->col,isrow=a->row;
803   PetscErrorCode ierr;
804   const PetscInt *r,*c,*rout,*cout;
805   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807   MatScalar      *aa=a->a,*v;
808   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
809   PetscScalar    *x,*b,*t;
810 
811   PetscFunctionBegin;
812   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   t  = a->solve_work;
815 
816   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818 
819   /* copy the b into temp work space according to permutation */
820   ii = 0;
821   for (i=0; i<n; i++) {
822     ic      = 5*c[i];
823     t[ii]   = b[ic];
824     t[ii+1] = b[ic+1];
825     t[ii+2] = b[ic+2];
826     t[ii+3] = b[ic+3];
827     t[ii+4] = b[ic+4];
828     ii += 5;
829   }
830 
831   /* forward solve the U^T */
832   idx = 0;
833   for (i=0; i<n; i++) {
834 
835     v     = aa + 25*diag[i];
836     /* multiply by the inverse of the block diagonal */
837     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843     v += 25;
844 
845     vi    = aj + diag[i] + 1;
846     nz    = ai[i+1] - diag[i] - 1;
847     while (nz--) {
848       oidx = 5*(*vi++);
849       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v  += 25;
855     }
856     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857     idx += 5;
858   }
859   /* backward solve the L^T */
860   for (i=n-1; i>=0; i--){
861     v    = aa + 25*diag[i] - 25;
862     vi   = aj + diag[i] - 1;
863     nz   = diag[i] - ai[i];
864     idt  = 5*i;
865     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866     while (nz--) {
867       idx   = 5*(*vi--);
868       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873       v -= 25;
874     }
875   }
876 
877   /* copy t into x according to permutation */
878   ii = 0;
879   for (i=0; i<n; i++) {
880     ir      = 5*r[i];
881     x[ir]   = t[ii];
882     x[ir+1] = t[ii+1];
883     x[ir+2] = t[ii+2];
884     x[ir+3] = t[ii+3];
885     x[ir+4] = t[ii+4];
886     ii += 5;
887   }
888 
889   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
891   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902   IS             iscol=a->col,isrow=a->row;
903   PetscErrorCode ierr;
904   const PetscInt *r,*c,*rout,*cout;
905   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907   MatScalar      *aa=a->a,*v;
908   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
909   PetscScalar    *x,*b,*t;
910 
911   PetscFunctionBegin;
912   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914   t  = a->solve_work;
915 
916   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918 
919   /* copy the b into temp work space according to permutation */
920   ii = 0;
921   for (i=0; i<n; i++) {
922     ic      = 6*c[i];
923     t[ii]   = b[ic];
924     t[ii+1] = b[ic+1];
925     t[ii+2] = b[ic+2];
926     t[ii+3] = b[ic+3];
927     t[ii+4] = b[ic+4];
928     t[ii+5] = b[ic+5];
929     ii += 6;
930   }
931 
932   /* forward solve the U^T */
933   idx = 0;
934   for (i=0; i<n; i++) {
935 
936     v     = aa + 36*diag[i];
937     /* multiply by the inverse of the block diagonal */
938     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939     x6    = t[5+idx];
940     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946     v += 36;
947 
948     vi    = aj + diag[i] + 1;
949     nz    = ai[i+1] - diag[i] - 1;
950     while (nz--) {
951       oidx = 6*(*vi++);
952       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958       v  += 36;
959     }
960     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961     t[5+idx] = s6;
962     idx += 6;
963   }
964   /* backward solve the L^T */
965   for (i=n-1; i>=0; i--){
966     v    = aa + 36*diag[i] - 36;
967     vi   = aj + diag[i] - 1;
968     nz   = diag[i] - ai[i];
969     idt  = 6*i;
970     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971     s6 = t[5+idt];
972     while (nz--) {
973       idx   = 6*(*vi--);
974       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980       v -= 36;
981     }
982   }
983 
984   /* copy t into x according to permutation */
985   ii = 0;
986   for (i=0; i<n; i++) {
987     ir      = 6*r[i];
988     x[ir]   = t[ii];
989     x[ir+1] = t[ii+1];
990     x[ir+2] = t[ii+2];
991     x[ir+3] = t[ii+3];
992     x[ir+4] = t[ii+4];
993     x[ir+5] = t[ii+5];
994     ii += 6;
995   }
996 
997   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
999   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008 {
1009   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010   IS             iscol=a->col,isrow=a->row;
1011   PetscErrorCode ierr;
1012   const PetscInt *r,*c,*rout,*cout;
1013   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015   MatScalar      *aa=a->a,*v;
1016   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1017   PetscScalar    *x,*b,*t;
1018 
1019   PetscFunctionBegin;
1020   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022   t  = a->solve_work;
1023 
1024   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026 
1027   /* copy the b into temp work space according to permutation */
1028   ii = 0;
1029   for (i=0; i<n; i++) {
1030     ic      = 7*c[i];
1031     t[ii]   = b[ic];
1032     t[ii+1] = b[ic+1];
1033     t[ii+2] = b[ic+2];
1034     t[ii+3] = b[ic+3];
1035     t[ii+4] = b[ic+4];
1036     t[ii+5] = b[ic+5];
1037     t[ii+6] = b[ic+6];
1038     ii += 7;
1039   }
1040 
1041   /* forward solve the U^T */
1042   idx = 0;
1043   for (i=0; i<n; i++) {
1044 
1045     v     = aa + 49*diag[i];
1046     /* multiply by the inverse of the block diagonal */
1047     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048     x6    = t[5+idx]; x7 = t[6+idx];
1049     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056     v += 49;
1057 
1058     vi    = aj + diag[i] + 1;
1059     nz    = ai[i+1] - diag[i] - 1;
1060     while (nz--) {
1061       oidx = 7*(*vi++);
1062       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v  += 49;
1070     }
1071     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072     t[5+idx] = s6;t[6+idx] = s7;
1073     idx += 7;
1074   }
1075   /* backward solve the L^T */
1076   for (i=n-1; i>=0; i--){
1077     v    = aa + 49*diag[i] - 49;
1078     vi   = aj + diag[i] - 1;
1079     nz   = diag[i] - ai[i];
1080     idt  = 7*i;
1081     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082     s6 = t[5+idt];s7 = t[6+idt];
1083     while (nz--) {
1084       idx   = 7*(*vi--);
1085       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092       v -= 49;
1093     }
1094   }
1095 
1096   /* copy t into x according to permutation */
1097   ii = 0;
1098   for (i=0; i<n; i++) {
1099     ir      = 7*r[i];
1100     x[ir]   = t[ii];
1101     x[ir+1] = t[ii+1];
1102     x[ir+2] = t[ii+2];
1103     x[ir+3] = t[ii+3];
1104     x[ir+4] = t[ii+4];
1105     x[ir+5] = t[ii+5];
1106     x[ir+6] = t[ii+6];
1107     ii += 7;
1108   }
1109 
1110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1112   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 /* ----------------------------------------------------------- */
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1124   IS             iscol=a->col,isrow=a->row;
1125   PetscErrorCode ierr;
1126   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1127   PetscInt       i,n=a->mbs;
1128   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1129   MatScalar      *aa=a->a,*v;
1130   PetscScalar    *x,*b,*s,*t,*ls;
1131 
1132   PetscFunctionBegin;
1133   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135   t  = a->solve_work;
1136 
1137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1139 
1140   /* forward solve the lower triangular */
1141   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1142   for (i=1; i<n; i++) {
1143     v   = aa + bs2*ai[i];
1144     vi  = aj + ai[i];
1145     nz  = a->diag[i] - ai[i];
1146     s = t + bs*i;
1147     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     while (nz--) {
1149       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1150       v += bs2;
1151     }
1152   }
1153   /* backward solve the upper triangular */
1154   ls = a->solve_work + A->cmap->n;
1155   for (i=n-1; i>=0; i--){
1156     v   = aa + bs2*(a->diag[i] + 1);
1157     vi  = aj + a->diag[i] + 1;
1158     nz  = ai[i+1] - a->diag[i] - 1;
1159     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1160     while (nz--) {
1161       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1162       v += bs2;
1163     }
1164     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1165     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1166   }
1167 
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNCT__
1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1179 {
1180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1181   IS             iscol=a->col,isrow=a->row;
1182   PetscErrorCode ierr;
1183   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1184   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1185   MatScalar      *aa=a->a,*v;
1186   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1187   PetscScalar    *x,*b,*t;
1188 
1189   PetscFunctionBegin;
1190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192   t  = a->solve_work;
1193 
1194   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1195   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1196 
1197   /* forward solve the lower triangular */
1198   idx    = 7*(*r++);
1199   t[0] = b[idx];   t[1] = b[1+idx];
1200   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201   t[5] = b[5+idx]; t[6] = b[6+idx];
1202 
1203   for (i=1; i<n; i++) {
1204     v     = aa + 49*ai[i];
1205     vi    = aj + ai[i];
1206     nz    = diag[i] - ai[i];
1207     idx   = 7*(*r++);
1208     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1210     while (nz--) {
1211       idx   = 7*(*vi++);
1212       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213       x4    = t[3+idx];x5 = t[4+idx];
1214       x6    = t[5+idx];x7 = t[6+idx];
1215       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1222       v += 49;
1223     }
1224     idx = 7*i;
1225     t[idx]   = s1;t[1+idx] = s2;
1226     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227     t[5+idx] = s6;t[6+idx] = s7;
1228   }
1229   /* backward solve the upper triangular */
1230   for (i=n-1; i>=0; i--){
1231     v    = aa + 49*diag[i] + 49;
1232     vi   = aj + diag[i] + 1;
1233     nz   = ai[i+1] - diag[i] - 1;
1234     idt  = 7*i;
1235     s1 = t[idt];  s2 = t[1+idt];
1236     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237     s6 = t[5+idt];s7 = t[6+idt];
1238     while (nz--) {
1239       idx   = 7*(*vi++);
1240       x1    = t[idx];   x2 = t[1+idx];
1241       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242       x6    = t[5+idx]; x7 = t[6+idx];
1243       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1250       v += 49;
1251     }
1252     idc = 7*(*c--);
1253     v   = aa + 49*diag[i];
1254     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1268   }
1269 
1270   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1271   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1272   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1273   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1280 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1281 {
1282   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1283   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1284   PetscErrorCode    ierr;
1285   PetscInt          *diag = a->diag,jdx;
1286   const MatScalar   *aa=a->a,*v;
1287   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1288   const PetscScalar *b;
1289 
1290   PetscFunctionBegin;
1291   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1292   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1293   /* forward solve the lower triangular */
1294   idx    = 0;
1295   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1296   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1297   x[6] = b[6+idx];
1298   for (i=1; i<n; i++) {
1299     v     =  aa + 49*ai[i];
1300     vi    =  aj + ai[i];
1301     nz    =  diag[i] - ai[i];
1302     idx   =  7*i;
1303     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1304     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1305     s7  =  b[6+idx];
1306     while (nz--) {
1307       jdx   = 7*(*vi++);
1308       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1309       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1310       x7    = x[6+jdx];
1311       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1312       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1313       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1314       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1315       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1316       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1317       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1318       v += 49;
1319      }
1320     x[idx]   = s1;
1321     x[1+idx] = s2;
1322     x[2+idx] = s3;
1323     x[3+idx] = s4;
1324     x[4+idx] = s5;
1325     x[5+idx] = s6;
1326     x[6+idx] = s7;
1327   }
1328   /* backward solve the upper triangular */
1329   for (i=n-1; i>=0; i--){
1330     v    = aa + 49*diag[i] + 49;
1331     vi   = aj + diag[i] + 1;
1332     nz   = ai[i+1] - diag[i] - 1;
1333     idt  = 7*i;
1334     s1 = x[idt];   s2 = x[1+idt];
1335     s3 = x[2+idt]; s4 = x[3+idt];
1336     s5 = x[4+idt]; s6 = x[5+idt];
1337     s7 = x[6+idt];
1338     while (nz--) {
1339       idx   = 7*(*vi++);
1340       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1341       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1342       x7    = x[6+idx];
1343       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1344       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1345       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1346       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1347       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1348       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1349       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1350       v += 49;
1351     }
1352     v        = aa + 49*diag[i];
1353     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1354                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1355     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1356                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1357     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1358                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1359     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1360                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1361     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1362                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1363     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1364                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1365     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1366                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1367   }
1368 
1369   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1370   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1371   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1372   PetscFunctionReturn(0);
1373 }
1374 
1375 #undef __FUNCT__
1376 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1377 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1378 {
1379     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1380     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1381     PetscErrorCode    ierr;
1382     PetscInt          idx,jdx,idt;
1383     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1384     const MatScalar   *aa=a->a,*v;
1385     PetscScalar       *x;
1386     const PetscScalar *b;
1387     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1388 
1389     PetscFunctionBegin;
1390     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1391     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1392     /* forward solve the lower triangular */
1393     idx    = 0;
1394     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1395     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1396     for (i=1; i<n; i++) {
1397        v    = aa + bs2*ai[i];
1398        vi   = aj + ai[i];
1399        nz   = ai[i+1] - ai[i];
1400       idx   = bs*i;
1401        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1402        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1403        while (nz--) {
1404           jdx   = bs*(*vi++);
1405           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1406 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1407           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1408           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1409           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1410 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1411           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1412 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1413 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1414           v   +=  bs2;
1415         }
1416 
1417        x[idx]   = s1;
1418        x[1+idx] = s2;
1419        x[2+idx] = s3;
1420        x[3+idx] = s4;
1421        x[4+idx] = s5;
1422        x[5+idx] = s6;
1423        x[6+idx] = s7;
1424     }
1425 
1426    /* backward solve the upper triangular */
1427   for (i=n-1; i>=0; i--){
1428      v   = aa + bs2*ai[2*n-i];
1429      vi  = aj + ai[2*n-i];
1430      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1431      idt = bs*i;
1432      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1433      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1434     while (nz--) {
1435       idx   = bs*(*vi++);
1436        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1437        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1438        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1439        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1440        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1441        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1442        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1443        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1444        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1445         v   +=  bs2;
1446     }
1447     /* x = inv_diagonal*x */
1448     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1449     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1450     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1451     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1452     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1453     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1454     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1455   }
1456 
1457   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1459   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1460   PetscFunctionReturn(0);
1461 }
1462 
1463 #undef __FUNCT__
1464 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1465 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1466 {
1467   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1468   IS                iscol=a->col,isrow=a->row;
1469   PetscErrorCode    ierr;
1470   const PetscInt    *r,*c,*rout,*cout;
1471   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1472   const MatScalar   *aa=a->a,*v;
1473   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1474   const PetscScalar *b;
1475   PetscFunctionBegin;
1476   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1477   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1478   t  = a->solve_work;
1479 
1480   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1481   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1482 
1483   /* forward solve the lower triangular */
1484   idx    = 6*(*r++);
1485   t[0] = b[idx];   t[1] = b[1+idx];
1486   t[2] = b[2+idx]; t[3] = b[3+idx];
1487   t[4] = b[4+idx]; t[5] = b[5+idx];
1488   for (i=1; i<n; i++) {
1489     v     = aa + 36*ai[i];
1490     vi    = aj + ai[i];
1491     nz    = diag[i] - ai[i];
1492     idx   = 6*(*r++);
1493     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1494     s5  = b[4+idx]; s6 = b[5+idx];
1495     while (nz--) {
1496       idx   = 6*(*vi++);
1497       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1498       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1499       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1500       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1501       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1502       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1503       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1504       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1505       v += 36;
1506     }
1507     idx = 6*i;
1508     t[idx]   = s1;t[1+idx] = s2;
1509     t[2+idx] = s3;t[3+idx] = s4;
1510     t[4+idx] = s5;t[5+idx] = s6;
1511   }
1512   /* backward solve the upper triangular */
1513   for (i=n-1; i>=0; i--){
1514     v    = aa + 36*diag[i] + 36;
1515     vi   = aj + diag[i] + 1;
1516     nz   = ai[i+1] - diag[i] - 1;
1517     idt  = 6*i;
1518     s1 = t[idt];  s2 = t[1+idt];
1519     s3 = t[2+idt];s4 = t[3+idt];
1520     s5 = t[4+idt];s6 = t[5+idt];
1521     while (nz--) {
1522       idx   = 6*(*vi++);
1523       x1    = t[idx];   x2 = t[1+idx];
1524       x3    = t[2+idx]; x4 = t[3+idx];
1525       x5    = t[4+idx]; x6 = t[5+idx];
1526       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1527       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1528       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1529       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1530       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1531       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1532       v += 36;
1533     }
1534     idc = 6*(*c--);
1535     v   = aa + 36*diag[i];
1536     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1537                                  v[18]*s4+v[24]*s5+v[30]*s6;
1538     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1539                                  v[19]*s4+v[25]*s5+v[31]*s6;
1540     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1541                                  v[20]*s4+v[26]*s5+v[32]*s6;
1542     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1543                                  v[21]*s4+v[27]*s5+v[33]*s6;
1544     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1545                                  v[22]*s4+v[28]*s5+v[34]*s6;
1546     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1547                                  v[23]*s4+v[29]*s5+v[35]*s6;
1548   }
1549 
1550   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1551   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1552   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1553   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1554   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1555   PetscFunctionReturn(0);
1556 }
1557 
1558 #undef __FUNCT__
1559 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1560 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1561 {
1562   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1563   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1564   PetscErrorCode    ierr;
1565   PetscInt          *diag = a->diag,jdx;
1566   const MatScalar   *aa=a->a,*v;
1567   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1568   const PetscScalar *b;
1569 
1570   PetscFunctionBegin;
1571   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1572   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1573   /* forward solve the lower triangular */
1574   idx    = 0;
1575   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1576   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1577   for (i=1; i<n; i++) {
1578     v     =  aa + 36*ai[i];
1579     vi    =  aj + ai[i];
1580     nz    =  diag[i] - ai[i];
1581     idx   =  6*i;
1582     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1583     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1584     while (nz--) {
1585       jdx   = 6*(*vi++);
1586       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1587       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1588       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1589       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1590       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1591       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1592       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1593       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1594       v += 36;
1595      }
1596     x[idx]   = s1;
1597     x[1+idx] = s2;
1598     x[2+idx] = s3;
1599     x[3+idx] = s4;
1600     x[4+idx] = s5;
1601     x[5+idx] = s6;
1602   }
1603   /* backward solve the upper triangular */
1604   for (i=n-1; i>=0; i--){
1605     v    = aa + 36*diag[i] + 36;
1606     vi   = aj + diag[i] + 1;
1607     nz   = ai[i+1] - diag[i] - 1;
1608     idt  = 6*i;
1609     s1 = x[idt];   s2 = x[1+idt];
1610     s3 = x[2+idt]; s4 = x[3+idt];
1611     s5 = x[4+idt]; s6 = x[5+idt];
1612     while (nz--) {
1613       idx   = 6*(*vi++);
1614       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1615       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1616       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1617       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1618       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1619       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1620       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1621       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1622       v += 36;
1623     }
1624     v        = aa + 36*diag[i];
1625     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1626     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1627     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1628     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1629     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1630     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1631   }
1632 
1633   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1634   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1635   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1636   PetscFunctionReturn(0);
1637 }
1638 
1639 #undef __FUNCT__
1640 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1641 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1642 {
1643     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1644     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1645     PetscErrorCode    ierr;
1646     PetscInt          idx,jdx,idt;
1647     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1648     const MatScalar   *aa=a->a,*v;
1649     PetscScalar       *x;
1650     const PetscScalar *b;
1651     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1652 
1653     PetscFunctionBegin;
1654     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1655     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1656     /* forward solve the lower triangular */
1657     idx    = 0;
1658     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1659     x[4] = b[4+idx];x[5] = b[5+idx];
1660     for (i=1; i<n; i++) {
1661        v    = aa + bs2*ai[i];
1662        vi   = aj + ai[i];
1663        nz   = ai[i+1] - ai[i];
1664       idx   = bs*i;
1665        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1666        s5   = b[4+idx];s6 = b[5+idx];
1667        while (nz--) {
1668           jdx   = bs*(*vi++);
1669           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1670 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1671           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1672           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1673           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1674 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1675           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1676 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1677           v   +=  bs2;
1678         }
1679 
1680        x[idx]   = s1;
1681        x[1+idx] = s2;
1682        x[2+idx] = s3;
1683        x[3+idx] = s4;
1684        x[4+idx] = s5;
1685        x[5+idx] = s6;
1686     }
1687 
1688    /* backward solve the upper triangular */
1689   for (i=n-1; i>=0; i--){
1690      v   = aa + bs2*ai[2*n-i];
1691      vi  = aj + ai[2*n-i];
1692      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1693      idt = bs*i;
1694      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1695      s5 = x[4+idt];s6 = x[5+idt];
1696     while (nz--) {
1697       idx   = bs*(*vi++);
1698        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1699        x5    = x[4+idx];x6 = x[5+idx];
1700        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1701        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1702        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1703        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1704        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1705        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1706         v   +=  bs2;
1707     }
1708     /* x = inv_diagonal*x */
1709    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1710    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1711    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1712    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1713    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1714    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1715   }
1716 
1717   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1718   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1719   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1720   PetscFunctionReturn(0);
1721 }
1722 #undef __FUNCT__
1723 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1724 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1725 {
1726   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1727   IS                iscol=a->col,isrow=a->row;
1728   PetscErrorCode    ierr;
1729   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1730   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1731   const MatScalar   *aa=a->a,*v;
1732   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1733   const PetscScalar *b;
1734 
1735   PetscFunctionBegin;
1736   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1737   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1738   t  = a->solve_work;
1739 
1740   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1741   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1742 
1743   /* forward solve the lower triangular */
1744   idx    = 5*(*r++);
1745   t[0] = b[idx];   t[1] = b[1+idx];
1746   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1747   for (i=1; i<n; i++) {
1748     v     = aa + 25*ai[i];
1749     vi    = aj + ai[i];
1750     nz    = diag[i] - ai[i];
1751     idx   = 5*(*r++);
1752     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1753     s5  = b[4+idx];
1754     while (nz--) {
1755       idx   = 5*(*vi++);
1756       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1757       x4    = t[3+idx];x5 = t[4+idx];
1758       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1759       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1760       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1761       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1762       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1763       v += 25;
1764     }
1765     idx = 5*i;
1766     t[idx]   = s1;t[1+idx] = s2;
1767     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1768   }
1769   /* backward solve the upper triangular */
1770   for (i=n-1; i>=0; i--){
1771     v    = aa + 25*diag[i] + 25;
1772     vi   = aj + diag[i] + 1;
1773     nz   = ai[i+1] - diag[i] - 1;
1774     idt  = 5*i;
1775     s1 = t[idt];  s2 = t[1+idt];
1776     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1777     while (nz--) {
1778       idx   = 5*(*vi++);
1779       x1    = t[idx];   x2 = t[1+idx];
1780       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1781       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1782       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1783       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1784       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1785       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1786       v += 25;
1787     }
1788     idc = 5*(*c--);
1789     v   = aa + 25*diag[i];
1790     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1791                                  v[15]*s4+v[20]*s5;
1792     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1793                                  v[16]*s4+v[21]*s5;
1794     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1795                                  v[17]*s4+v[22]*s5;
1796     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1797                                  v[18]*s4+v[23]*s5;
1798     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1799                                  v[19]*s4+v[24]*s5;
1800   }
1801 
1802   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1803   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1804   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1805   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1806   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1807   PetscFunctionReturn(0);
1808 }
1809 
1810 #undef __FUNCT__
1811 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1812 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1813 {
1814   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1815   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1816   PetscErrorCode    ierr;
1817   PetscInt          *diag = a->diag,jdx;
1818   const MatScalar   *aa=a->a,*v;
1819   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1820   const PetscScalar *b;
1821 
1822   PetscFunctionBegin;
1823   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1824   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1825   /* forward solve the lower triangular */
1826   idx    = 0;
1827   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1828   for (i=1; i<n; i++) {
1829     v     =  aa + 25*ai[i];
1830     vi    =  aj + ai[i];
1831     nz    =  diag[i] - ai[i];
1832     idx   =  5*i;
1833     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1834     while (nz--) {
1835       jdx   = 5*(*vi++);
1836       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1837       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1838       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1839       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1840       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1841       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1842       v    += 25;
1843     }
1844     x[idx]   = s1;
1845     x[1+idx] = s2;
1846     x[2+idx] = s3;
1847     x[3+idx] = s4;
1848     x[4+idx] = s5;
1849   }
1850   /* backward solve the upper triangular */
1851   for (i=n-1; i>=0; i--){
1852     v    = aa + 25*diag[i] + 25;
1853     vi   = aj + diag[i] + 1;
1854     nz   = ai[i+1] - diag[i] - 1;
1855     idt  = 5*i;
1856     s1 = x[idt];  s2 = x[1+idt];
1857     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1858     while (nz--) {
1859       idx   = 5*(*vi++);
1860       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1861       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1862       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1863       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1864       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1865       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1866       v    += 25;
1867     }
1868     v        = aa + 25*diag[i];
1869     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1870     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1871     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1872     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1873     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1874   }
1875 
1876   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1877   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1878   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1879   PetscFunctionReturn(0);
1880 }
1881 
1882 #undef __FUNCT__
1883 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
1884 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1885 {
1886   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1887   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1888   PetscErrorCode    ierr;
1889   PetscInt          jdx;
1890   const MatScalar   *aa=a->a,*v;
1891   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1892   const PetscScalar *b;
1893 
1894   PetscFunctionBegin;
1895   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1896   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1897   /* forward solve the lower triangular */
1898   idx    = 0;
1899   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1900   for (i=1; i<n; i++) {
1901     v   = aa + 25*ai[i];
1902     vi  = aj + ai[i];
1903     nz  = ai[i+1] - ai[i];
1904     idx = 5*i;
1905     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1906     while (nz--) {
1907       jdx   = 5*(*vi++);
1908       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1909       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1910       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1911       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1912       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1913       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1914       v    += 25;
1915     }
1916     x[idx]   = s1;
1917     x[1+idx] = s2;
1918     x[2+idx] = s3;
1919     x[3+idx] = s4;
1920     x[4+idx] = s5;
1921   }
1922 
1923   /* backward solve the upper triangular */
1924   for (i=n-1; i>=0; i--){
1925     v   = aa + 25*ai[2*n-i];
1926     vi  = aj + ai[2*n-i];
1927     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1928     idt = 5*i;
1929     s1 = x[idt];  s2 = x[1+idt];
1930     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1931     while (nz--) {
1932       idx   = 5*(*vi++);
1933       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1934       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1935       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1936       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1937       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1938       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1939       v    += 25;
1940     }
1941     /* x = inv_diagonal*x */
1942     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1943     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1944     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1945     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1946     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1947   }
1948 
1949   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1950   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 
1955 #undef __FUNCT__
1956 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1957 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1958 {
1959   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1960   IS                iscol=a->col,isrow=a->row;
1961   PetscErrorCode    ierr;
1962   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1963   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1964   const MatScalar   *aa=a->a,*v;
1965   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1966   const PetscScalar *b;
1967 
1968   PetscFunctionBegin;
1969   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1970   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1971   t  = a->solve_work;
1972 
1973   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1974   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1975 
1976   /* forward solve the lower triangular */
1977   idx    = 4*(*r++);
1978   t[0] = b[idx];   t[1] = b[1+idx];
1979   t[2] = b[2+idx]; t[3] = b[3+idx];
1980   for (i=1; i<n; i++) {
1981     v     = aa + 16*ai[i];
1982     vi    = aj + ai[i];
1983     nz    = diag[i] - ai[i];
1984     idx   = 4*(*r++);
1985     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1986     while (nz--) {
1987       idx   = 4*(*vi++);
1988       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1989       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1990       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1991       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1992       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1993       v    += 16;
1994     }
1995     idx        = 4*i;
1996     t[idx]   = s1;t[1+idx] = s2;
1997     t[2+idx] = s3;t[3+idx] = s4;
1998   }
1999   /* backward solve the upper triangular */
2000   for (i=n-1; i>=0; i--){
2001     v    = aa + 16*diag[i] + 16;
2002     vi   = aj + diag[i] + 1;
2003     nz   = ai[i+1] - diag[i] - 1;
2004     idt  = 4*i;
2005     s1 = t[idt];  s2 = t[1+idt];
2006     s3 = t[2+idt];s4 = t[3+idt];
2007     while (nz--) {
2008       idx   = 4*(*vi++);
2009       x1    = t[idx];   x2 = t[1+idx];
2010       x3    = t[2+idx]; x4 = t[3+idx];
2011       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2012       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2013       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2014       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2015       v += 16;
2016     }
2017     idc      = 4*(*c--);
2018     v        = aa + 16*diag[i];
2019     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2020     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2021     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2022     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2023   }
2024 
2025   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2026   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2027   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2028   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2029   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2030   PetscFunctionReturn(0);
2031 }
2032 
2033 #undef __FUNCT__
2034 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2035 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2036 {
2037   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2038   IS                iscol=a->col,isrow=a->row;
2039   PetscErrorCode    ierr;
2040   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2041   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2042   const MatScalar   *aa=a->a,*v;
2043   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2044   PetscScalar       *x;
2045   const PetscScalar *b;
2046 
2047   PetscFunctionBegin;
2048   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2049   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2050   t  = (MatScalar *)a->solve_work;
2051 
2052   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2053   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2054 
2055   /* forward solve the lower triangular */
2056   idx    = 4*(*r++);
2057   t[0] = (MatScalar)b[idx];
2058   t[1] = (MatScalar)b[1+idx];
2059   t[2] = (MatScalar)b[2+idx];
2060   t[3] = (MatScalar)b[3+idx];
2061   for (i=1; i<n; i++) {
2062     v     = aa + 16*ai[i];
2063     vi    = aj + ai[i];
2064     nz    = diag[i] - ai[i];
2065     idx   = 4*(*r++);
2066     s1 = (MatScalar)b[idx];
2067     s2 = (MatScalar)b[1+idx];
2068     s3 = (MatScalar)b[2+idx];
2069     s4 = (MatScalar)b[3+idx];
2070     while (nz--) {
2071       idx   = 4*(*vi++);
2072       x1  = t[idx];
2073       x2  = t[1+idx];
2074       x3  = t[2+idx];
2075       x4  = t[3+idx];
2076       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2077       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2078       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2079       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2080       v    += 16;
2081     }
2082     idx        = 4*i;
2083     t[idx]   = s1;
2084     t[1+idx] = s2;
2085     t[2+idx] = s3;
2086     t[3+idx] = s4;
2087   }
2088   /* backward solve the upper triangular */
2089   for (i=n-1; i>=0; i--){
2090     v    = aa + 16*diag[i] + 16;
2091     vi   = aj + diag[i] + 1;
2092     nz   = ai[i+1] - diag[i] - 1;
2093     idt  = 4*i;
2094     s1 = t[idt];
2095     s2 = t[1+idt];
2096     s3 = t[2+idt];
2097     s4 = t[3+idt];
2098     while (nz--) {
2099       idx   = 4*(*vi++);
2100       x1  = t[idx];
2101       x2  = t[1+idx];
2102       x3  = t[2+idx];
2103       x4  = t[3+idx];
2104       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2105       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2106       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2107       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2108       v += 16;
2109     }
2110     idc      = 4*(*c--);
2111     v        = aa + 16*diag[i];
2112     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2113     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2114     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2115     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2116     x[idc]   = (PetscScalar)t[idt];
2117     x[1+idc] = (PetscScalar)t[1+idt];
2118     x[2+idc] = (PetscScalar)t[2+idt];
2119     x[3+idc] = (PetscScalar)t[3+idt];
2120  }
2121 
2122   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2123   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2124   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2125   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2126   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2127   PetscFunctionReturn(0);
2128 }
2129 
2130 #if defined (PETSC_HAVE_SSE)
2131 
2132 #include PETSC_HAVE_SSE
2133 
2134 #undef __FUNCT__
2135 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2136 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
2137 {
2138   /*
2139      Note: This code uses demotion of double
2140      to float when performing the mixed-mode computation.
2141      This may not be numerically reasonable for all applications.
2142   */
2143   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2144   IS             iscol=a->col,isrow=a->row;
2145   PetscErrorCode ierr;
2146   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
2147   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2148   MatScalar      *aa=a->a,*v;
2149   PetscScalar    *x,*b,*t;
2150 
2151   /* Make space in temp stack for 16 Byte Aligned arrays */
2152   float           ssealignedspace[11],*tmps,*tmpx;
2153   unsigned long   offset;
2154 
2155   PetscFunctionBegin;
2156   SSE_SCOPE_BEGIN;
2157 
2158     offset = (unsigned long)ssealignedspace % 16;
2159     if (offset) offset = (16 - offset)/4;
2160     tmps = &ssealignedspace[offset];
2161     tmpx = &ssealignedspace[offset+4];
2162     PREFETCH_NTA(aa+16*ai[1]);
2163 
2164     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2165     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2166     t  = a->solve_work;
2167 
2168     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2169     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2170 
2171     /* forward solve the lower triangular */
2172     idx  = 4*(*r++);
2173     t[0] = b[idx];   t[1] = b[1+idx];
2174     t[2] = b[2+idx]; t[3] = b[3+idx];
2175     v    =  aa + 16*ai[1];
2176 
2177     for (i=1; i<n;) {
2178       PREFETCH_NTA(&v[8]);
2179       vi   =  aj      + ai[i];
2180       nz   =  diag[i] - ai[i];
2181       idx  =  4*(*r++);
2182 
2183       /* Demote sum from double to float */
2184       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2185       LOAD_PS(tmps,XMM7);
2186 
2187       while (nz--) {
2188         PREFETCH_NTA(&v[16]);
2189         idx = 4*(*vi++);
2190 
2191         /* Demote solution (so far) from double to float */
2192         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2193 
2194         /* 4x4 Matrix-Vector product with negative accumulation: */
2195         SSE_INLINE_BEGIN_2(tmpx,v)
2196           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2197 
2198           /* First Column */
2199           SSE_COPY_PS(XMM0,XMM6)
2200           SSE_SHUFFLE(XMM0,XMM0,0x00)
2201           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2202           SSE_SUB_PS(XMM7,XMM0)
2203 
2204           /* Second Column */
2205           SSE_COPY_PS(XMM1,XMM6)
2206           SSE_SHUFFLE(XMM1,XMM1,0x55)
2207           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2208           SSE_SUB_PS(XMM7,XMM1)
2209 
2210           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2211 
2212           /* Third Column */
2213           SSE_COPY_PS(XMM2,XMM6)
2214           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2215           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2216           SSE_SUB_PS(XMM7,XMM2)
2217 
2218           /* Fourth Column */
2219           SSE_COPY_PS(XMM3,XMM6)
2220           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2221           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2222           SSE_SUB_PS(XMM7,XMM3)
2223         SSE_INLINE_END_2
2224 
2225         v  += 16;
2226       }
2227       idx = 4*i;
2228       v   = aa + 16*ai[++i];
2229       PREFETCH_NTA(v);
2230       STORE_PS(tmps,XMM7);
2231 
2232       /* Promote result from float to double */
2233       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
2234     }
2235     /* backward solve the upper triangular */
2236     idt  = 4*(n-1);
2237     ai16 = 16*diag[n-1];
2238     v    = aa + ai16 + 16;
2239     for (i=n-1; i>=0;){
2240       PREFETCH_NTA(&v[8]);
2241       vi = aj + diag[i] + 1;
2242       nz = ai[i+1] - diag[i] - 1;
2243 
2244       /* Demote accumulator from double to float */
2245       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2246       LOAD_PS(tmps,XMM7);
2247 
2248       while (nz--) {
2249         PREFETCH_NTA(&v[16]);
2250         idx = 4*(*vi++);
2251 
2252         /* Demote solution (so far) from double to float */
2253         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2254 
2255         /* 4x4 Matrix-Vector Product with negative accumulation: */
2256         SSE_INLINE_BEGIN_2(tmpx,v)
2257           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2258 
2259           /* First Column */
2260           SSE_COPY_PS(XMM0,XMM6)
2261           SSE_SHUFFLE(XMM0,XMM0,0x00)
2262           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2263           SSE_SUB_PS(XMM7,XMM0)
2264 
2265           /* Second Column */
2266           SSE_COPY_PS(XMM1,XMM6)
2267           SSE_SHUFFLE(XMM1,XMM1,0x55)
2268           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2269           SSE_SUB_PS(XMM7,XMM1)
2270 
2271           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2272 
2273           /* Third Column */
2274           SSE_COPY_PS(XMM2,XMM6)
2275           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2276           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2277           SSE_SUB_PS(XMM7,XMM2)
2278 
2279           /* Fourth Column */
2280           SSE_COPY_PS(XMM3,XMM6)
2281           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2282           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2283           SSE_SUB_PS(XMM7,XMM3)
2284         SSE_INLINE_END_2
2285         v  += 16;
2286       }
2287       v    = aa + ai16;
2288       ai16 = 16*diag[--i];
2289       PREFETCH_NTA(aa+ai16+16);
2290       /*
2291          Scale the result by the diagonal 4x4 block,
2292          which was inverted as part of the factorization
2293       */
2294       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2295         /* First Column */
2296         SSE_COPY_PS(XMM0,XMM7)
2297         SSE_SHUFFLE(XMM0,XMM0,0x00)
2298         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2299 
2300         /* Second Column */
2301         SSE_COPY_PS(XMM1,XMM7)
2302         SSE_SHUFFLE(XMM1,XMM1,0x55)
2303         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2304         SSE_ADD_PS(XMM0,XMM1)
2305 
2306         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2307 
2308         /* Third Column */
2309         SSE_COPY_PS(XMM2,XMM7)
2310         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2311         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2312         SSE_ADD_PS(XMM0,XMM2)
2313 
2314         /* Fourth Column */
2315         SSE_COPY_PS(XMM3,XMM7)
2316         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2317         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2318         SSE_ADD_PS(XMM0,XMM3)
2319 
2320         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2321       SSE_INLINE_END_3
2322 
2323       /* Promote solution from float to double */
2324       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2325 
2326       /* Apply reordering to t and stream into x.    */
2327       /* This way, x doesn't pollute the cache.      */
2328       /* Be careful with size: 2 doubles = 4 floats! */
2329       idc  = 4*(*c--);
2330       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2331         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2332         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2333         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2334         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2335         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2336         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2337       SSE_INLINE_END_2
2338       v    = aa + ai16 + 16;
2339       idt -= 4;
2340     }
2341 
2342     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2343     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2344     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2345     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2346     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2347   SSE_SCOPE_END;
2348   PetscFunctionReturn(0);
2349 }
2350 
2351 #endif
2352 
2353 
2354 /*
2355       Special case where the matrix was ILU(0) factored in the natural
2356    ordering. This eliminates the need for the column and row permutation.
2357 */
2358 #undef __FUNCT__
2359 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2360 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2361 {
2362   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2363   PetscInt          n=a->mbs;
2364   const PetscInt    *ai=a->i,*aj=a->j;
2365   PetscErrorCode    ierr;
2366   const PetscInt    *diag = a->diag;
2367   const MatScalar   *aa=a->a;
2368   PetscScalar       *x;
2369   const PetscScalar *b;
2370 
2371   PetscFunctionBegin;
2372   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2373   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2374 
2375 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2376   {
2377     static PetscScalar w[2000]; /* very BAD need to fix */
2378     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2379   }
2380 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2381   {
2382     static PetscScalar w[2000]; /* very BAD need to fix */
2383     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2384   }
2385 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2386   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2387 #else
2388   {
2389     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2390     const MatScalar *v;
2391     PetscInt        jdx,idt,idx,nz,i,ai16;
2392     const PetscInt  *vi;
2393 
2394   /* forward solve the lower triangular */
2395   idx    = 0;
2396   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2397   for (i=1; i<n; i++) {
2398     v     =  aa      + 16*ai[i];
2399     vi    =  aj      + ai[i];
2400     nz    =  diag[i] - ai[i];
2401     idx   +=  4;
2402     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2403     while (nz--) {
2404       jdx   = 4*(*vi++);
2405       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2406       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2407       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2408       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2409       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2410       v    += 16;
2411     }
2412     x[idx]   = s1;
2413     x[1+idx] = s2;
2414     x[2+idx] = s3;
2415     x[3+idx] = s4;
2416   }
2417   /* backward solve the upper triangular */
2418   idt = 4*(n-1);
2419   for (i=n-1; i>=0; i--){
2420     ai16 = 16*diag[i];
2421     v    = aa + ai16 + 16;
2422     vi   = aj + diag[i] + 1;
2423     nz   = ai[i+1] - diag[i] - 1;
2424     s1 = x[idt];  s2 = x[1+idt];
2425     s3 = x[2+idt];s4 = x[3+idt];
2426     while (nz--) {
2427       idx   = 4*(*vi++);
2428       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2429       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2430       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2431       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2432       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2433       v    += 16;
2434     }
2435     v        = aa + ai16;
2436     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2437     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2438     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2439     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2440     idt -= 4;
2441   }
2442   }
2443 #endif
2444 
2445   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2446   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2447   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2448   PetscFunctionReturn(0);
2449 }
2450 
2451 #undef __FUNCT__
2452 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
2453 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2454 {
2455     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2456     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2457     PetscErrorCode    ierr;
2458     PetscInt          idx,jdx,idt;
2459     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2460     const MatScalar   *aa=a->a,*v;
2461     PetscScalar       *x;
2462     const PetscScalar *b;
2463     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2464 
2465     PetscFunctionBegin;
2466     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2467     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2468     /* forward solve the lower triangular */
2469     idx    = 0;
2470     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2471     for (i=1; i<n; i++) {
2472        v    = aa + bs2*ai[i];
2473        vi   = aj + ai[i];
2474        nz   = ai[i+1] - ai[i];
2475       idx   = bs*i;
2476        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2477        while (nz--) {
2478           jdx   = bs*(*vi++);
2479           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2480           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2481           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2482           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2483 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2484 
2485           v   +=  bs2;
2486         }
2487 
2488        x[idx]   = s1;
2489        x[1+idx] = s2;
2490        x[2+idx] = s3;
2491        x[3+idx] = s4;
2492     }
2493 
2494    /* backward solve the upper triangular */
2495   for (i=n-1; i>=0; i--){
2496      v   = aa + bs2*ai[2*n-i];
2497      vi  = aj + ai[2*n-i];
2498      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2499      idt = bs*i;
2500      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2501 
2502     while (nz--) {
2503       idx   = bs*(*vi++);
2504        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2505        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2506        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2507        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2508        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2509 
2510         v   +=  bs2;
2511     }
2512     /* x = inv_diagonal*x */
2513    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2514    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
2515    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2516    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2517 
2518   }
2519 
2520   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2521   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2522   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2523   PetscFunctionReturn(0);
2524 }
2525 
2526 
2527 
2528 #undef __FUNCT__
2529 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2530 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2531 {
2532   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2533   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2534   PetscErrorCode ierr;
2535   PetscInt       *diag = a->diag;
2536   MatScalar      *aa=a->a;
2537   PetscScalar    *x,*b;
2538 
2539   PetscFunctionBegin;
2540   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2541   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2542 
2543   {
2544     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2545     MatScalar  *v,*t=(MatScalar *)x;
2546     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2547 
2548     /* forward solve the lower triangular */
2549     idx  = 0;
2550     t[0] = (MatScalar)b[0];
2551     t[1] = (MatScalar)b[1];
2552     t[2] = (MatScalar)b[2];
2553     t[3] = (MatScalar)b[3];
2554     for (i=1; i<n; i++) {
2555       v     =  aa      + 16*ai[i];
2556       vi    =  aj      + ai[i];
2557       nz    =  diag[i] - ai[i];
2558       idx   +=  4;
2559       s1 = (MatScalar)b[idx];
2560       s2 = (MatScalar)b[1+idx];
2561       s3 = (MatScalar)b[2+idx];
2562       s4 = (MatScalar)b[3+idx];
2563       while (nz--) {
2564         jdx = 4*(*vi++);
2565         x1  = t[jdx];
2566         x2  = t[1+jdx];
2567         x3  = t[2+jdx];
2568         x4  = t[3+jdx];
2569         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2570         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2571         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2572         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2573         v    += 16;
2574       }
2575       t[idx]   = s1;
2576       t[1+idx] = s2;
2577       t[2+idx] = s3;
2578       t[3+idx] = s4;
2579     }
2580     /* backward solve the upper triangular */
2581     idt = 4*(n-1);
2582     for (i=n-1; i>=0; i--){
2583       ai16 = 16*diag[i];
2584       v    = aa + ai16 + 16;
2585       vi   = aj + diag[i] + 1;
2586       nz   = ai[i+1] - diag[i] - 1;
2587       s1   = t[idt];
2588       s2   = t[1+idt];
2589       s3   = t[2+idt];
2590       s4   = t[3+idt];
2591       while (nz--) {
2592         idx = 4*(*vi++);
2593         x1  = (MatScalar)x[idx];
2594         x2  = (MatScalar)x[1+idx];
2595         x3  = (MatScalar)x[2+idx];
2596         x4  = (MatScalar)x[3+idx];
2597         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2598         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2599         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2600         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2601         v    += 16;
2602       }
2603       v        = aa + ai16;
2604       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2605       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2606       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2607       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2608       idt -= 4;
2609     }
2610   }
2611 
2612   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2613   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2614   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2615   PetscFunctionReturn(0);
2616 }
2617 
2618 #if defined (PETSC_HAVE_SSE)
2619 
2620 #include PETSC_HAVE_SSE
2621 #undef __FUNCT__
2622 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2623 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2624 {
2625   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2626   unsigned short *aj=(unsigned short *)a->j;
2627   PetscErrorCode ierr;
2628   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2629   MatScalar      *aa=a->a;
2630   PetscScalar    *x,*b;
2631 
2632   PetscFunctionBegin;
2633   SSE_SCOPE_BEGIN;
2634   /*
2635      Note: This code currently uses demotion of double
2636      to float when performing the mixed-mode computation.
2637      This may not be numerically reasonable for all applications.
2638   */
2639   PREFETCH_NTA(aa+16*ai[1]);
2640 
2641   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2642   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2643   {
2644     /* x will first be computed in single precision then promoted inplace to double */
2645     MatScalar      *v,*t=(MatScalar *)x;
2646     int            nz,i,idt,ai16;
2647     unsigned int   jdx,idx;
2648     unsigned short *vi;
2649     /* Forward solve the lower triangular factor. */
2650 
2651     /* First block is the identity. */
2652     idx  = 0;
2653     CONVERT_DOUBLE4_FLOAT4(t,b);
2654     v    =  aa + 16*((unsigned int)ai[1]);
2655 
2656     for (i=1; i<n;) {
2657       PREFETCH_NTA(&v[8]);
2658       vi   =  aj      + ai[i];
2659       nz   =  diag[i] - ai[i];
2660       idx +=  4;
2661 
2662       /* Demote RHS from double to float. */
2663       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2664       LOAD_PS(&t[idx],XMM7);
2665 
2666       while (nz--) {
2667         PREFETCH_NTA(&v[16]);
2668         jdx = 4*((unsigned int)(*vi++));
2669 
2670         /* 4x4 Matrix-Vector product with negative accumulation: */
2671         SSE_INLINE_BEGIN_2(&t[jdx],v)
2672           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2673 
2674           /* First Column */
2675           SSE_COPY_PS(XMM0,XMM6)
2676           SSE_SHUFFLE(XMM0,XMM0,0x00)
2677           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2678           SSE_SUB_PS(XMM7,XMM0)
2679 
2680           /* Second Column */
2681           SSE_COPY_PS(XMM1,XMM6)
2682           SSE_SHUFFLE(XMM1,XMM1,0x55)
2683           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2684           SSE_SUB_PS(XMM7,XMM1)
2685 
2686           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2687 
2688           /* Third Column */
2689           SSE_COPY_PS(XMM2,XMM6)
2690           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2691           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2692           SSE_SUB_PS(XMM7,XMM2)
2693 
2694           /* Fourth Column */
2695           SSE_COPY_PS(XMM3,XMM6)
2696           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2697           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2698           SSE_SUB_PS(XMM7,XMM3)
2699         SSE_INLINE_END_2
2700 
2701         v  += 16;
2702       }
2703       v    =  aa + 16*ai[++i];
2704       PREFETCH_NTA(v);
2705       STORE_PS(&t[idx],XMM7);
2706     }
2707 
2708     /* Backward solve the upper triangular factor.*/
2709 
2710     idt  = 4*(n-1);
2711     ai16 = 16*diag[n-1];
2712     v    = aa + ai16 + 16;
2713     for (i=n-1; i>=0;){
2714       PREFETCH_NTA(&v[8]);
2715       vi = aj + diag[i] + 1;
2716       nz = ai[i+1] - diag[i] - 1;
2717 
2718       LOAD_PS(&t[idt],XMM7);
2719 
2720       while (nz--) {
2721         PREFETCH_NTA(&v[16]);
2722         idx = 4*((unsigned int)(*vi++));
2723 
2724         /* 4x4 Matrix-Vector Product with negative accumulation: */
2725         SSE_INLINE_BEGIN_2(&t[idx],v)
2726           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2727 
2728           /* First Column */
2729           SSE_COPY_PS(XMM0,XMM6)
2730           SSE_SHUFFLE(XMM0,XMM0,0x00)
2731           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2732           SSE_SUB_PS(XMM7,XMM0)
2733 
2734           /* Second Column */
2735           SSE_COPY_PS(XMM1,XMM6)
2736           SSE_SHUFFLE(XMM1,XMM1,0x55)
2737           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2738           SSE_SUB_PS(XMM7,XMM1)
2739 
2740           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2741 
2742           /* Third Column */
2743           SSE_COPY_PS(XMM2,XMM6)
2744           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2745           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2746           SSE_SUB_PS(XMM7,XMM2)
2747 
2748           /* Fourth Column */
2749           SSE_COPY_PS(XMM3,XMM6)
2750           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2751           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2752           SSE_SUB_PS(XMM7,XMM3)
2753         SSE_INLINE_END_2
2754         v  += 16;
2755       }
2756       v    = aa + ai16;
2757       ai16 = 16*diag[--i];
2758       PREFETCH_NTA(aa+ai16+16);
2759       /*
2760          Scale the result by the diagonal 4x4 block,
2761          which was inverted as part of the factorization
2762       */
2763       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2764         /* First Column */
2765         SSE_COPY_PS(XMM0,XMM7)
2766         SSE_SHUFFLE(XMM0,XMM0,0x00)
2767         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2768 
2769         /* Second Column */
2770         SSE_COPY_PS(XMM1,XMM7)
2771         SSE_SHUFFLE(XMM1,XMM1,0x55)
2772         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2773         SSE_ADD_PS(XMM0,XMM1)
2774 
2775         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2776 
2777         /* Third Column */
2778         SSE_COPY_PS(XMM2,XMM7)
2779         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2780         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2781         SSE_ADD_PS(XMM0,XMM2)
2782 
2783         /* Fourth Column */
2784         SSE_COPY_PS(XMM3,XMM7)
2785         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2786         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2787         SSE_ADD_PS(XMM0,XMM3)
2788 
2789         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2790       SSE_INLINE_END_3
2791 
2792       v    = aa + ai16 + 16;
2793       idt -= 4;
2794     }
2795 
2796     /* Convert t from single precision back to double precision (inplace)*/
2797     idt = 4*(n-1);
2798     for (i=n-1;i>=0;i--) {
2799       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2800       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2801       PetscScalar *xtemp=&x[idt];
2802       MatScalar   *ttemp=&t[idt];
2803       xtemp[3] = (PetscScalar)ttemp[3];
2804       xtemp[2] = (PetscScalar)ttemp[2];
2805       xtemp[1] = (PetscScalar)ttemp[1];
2806       xtemp[0] = (PetscScalar)ttemp[0];
2807       idt -= 4;
2808     }
2809 
2810   } /* End of artificial scope. */
2811   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2812   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2813   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2814   SSE_SCOPE_END;
2815   PetscFunctionReturn(0);
2816 }
2817 
2818 #undef __FUNCT__
2819 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2820 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2821 {
2822   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2823   int            *aj=a->j;
2824   PetscErrorCode ierr;
2825   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2826   MatScalar      *aa=a->a;
2827   PetscScalar    *x,*b;
2828 
2829   PetscFunctionBegin;
2830   SSE_SCOPE_BEGIN;
2831   /*
2832      Note: This code currently uses demotion of double
2833      to float when performing the mixed-mode computation.
2834      This may not be numerically reasonable for all applications.
2835   */
2836   PREFETCH_NTA(aa+16*ai[1]);
2837 
2838   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2839   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2840   {
2841     /* x will first be computed in single precision then promoted inplace to double */
2842     MatScalar *v,*t=(MatScalar *)x;
2843     int       nz,i,idt,ai16;
2844     int       jdx,idx;
2845     int       *vi;
2846     /* Forward solve the lower triangular factor. */
2847 
2848     /* First block is the identity. */
2849     idx  = 0;
2850     CONVERT_DOUBLE4_FLOAT4(t,b);
2851     v    =  aa + 16*ai[1];
2852 
2853     for (i=1; i<n;) {
2854       PREFETCH_NTA(&v[8]);
2855       vi   =  aj      + ai[i];
2856       nz   =  diag[i] - ai[i];
2857       idx +=  4;
2858 
2859       /* Demote RHS from double to float. */
2860       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2861       LOAD_PS(&t[idx],XMM7);
2862 
2863       while (nz--) {
2864         PREFETCH_NTA(&v[16]);
2865         jdx = 4*(*vi++);
2866 /*          jdx = *vi++; */
2867 
2868         /* 4x4 Matrix-Vector product with negative accumulation: */
2869         SSE_INLINE_BEGIN_2(&t[jdx],v)
2870           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2871 
2872           /* First Column */
2873           SSE_COPY_PS(XMM0,XMM6)
2874           SSE_SHUFFLE(XMM0,XMM0,0x00)
2875           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2876           SSE_SUB_PS(XMM7,XMM0)
2877 
2878           /* Second Column */
2879           SSE_COPY_PS(XMM1,XMM6)
2880           SSE_SHUFFLE(XMM1,XMM1,0x55)
2881           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2882           SSE_SUB_PS(XMM7,XMM1)
2883 
2884           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2885 
2886           /* Third Column */
2887           SSE_COPY_PS(XMM2,XMM6)
2888           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2889           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2890           SSE_SUB_PS(XMM7,XMM2)
2891 
2892           /* Fourth Column */
2893           SSE_COPY_PS(XMM3,XMM6)
2894           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2895           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2896           SSE_SUB_PS(XMM7,XMM3)
2897         SSE_INLINE_END_2
2898 
2899         v  += 16;
2900       }
2901       v    =  aa + 16*ai[++i];
2902       PREFETCH_NTA(v);
2903       STORE_PS(&t[idx],XMM7);
2904     }
2905 
2906     /* Backward solve the upper triangular factor.*/
2907 
2908     idt  = 4*(n-1);
2909     ai16 = 16*diag[n-1];
2910     v    = aa + ai16 + 16;
2911     for (i=n-1; i>=0;){
2912       PREFETCH_NTA(&v[8]);
2913       vi = aj + diag[i] + 1;
2914       nz = ai[i+1] - diag[i] - 1;
2915 
2916       LOAD_PS(&t[idt],XMM7);
2917 
2918       while (nz--) {
2919         PREFETCH_NTA(&v[16]);
2920         idx = 4*(*vi++);
2921 /*          idx = *vi++; */
2922 
2923         /* 4x4 Matrix-Vector Product with negative accumulation: */
2924         SSE_INLINE_BEGIN_2(&t[idx],v)
2925           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2926 
2927           /* First Column */
2928           SSE_COPY_PS(XMM0,XMM6)
2929           SSE_SHUFFLE(XMM0,XMM0,0x00)
2930           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2931           SSE_SUB_PS(XMM7,XMM0)
2932 
2933           /* Second Column */
2934           SSE_COPY_PS(XMM1,XMM6)
2935           SSE_SHUFFLE(XMM1,XMM1,0x55)
2936           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2937           SSE_SUB_PS(XMM7,XMM1)
2938 
2939           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2940 
2941           /* Third Column */
2942           SSE_COPY_PS(XMM2,XMM6)
2943           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2944           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2945           SSE_SUB_PS(XMM7,XMM2)
2946 
2947           /* Fourth Column */
2948           SSE_COPY_PS(XMM3,XMM6)
2949           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2950           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2951           SSE_SUB_PS(XMM7,XMM3)
2952         SSE_INLINE_END_2
2953         v  += 16;
2954       }
2955       v    = aa + ai16;
2956       ai16 = 16*diag[--i];
2957       PREFETCH_NTA(aa+ai16+16);
2958       /*
2959          Scale the result by the diagonal 4x4 block,
2960          which was inverted as part of the factorization
2961       */
2962       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2963         /* First Column */
2964         SSE_COPY_PS(XMM0,XMM7)
2965         SSE_SHUFFLE(XMM0,XMM0,0x00)
2966         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2967 
2968         /* Second Column */
2969         SSE_COPY_PS(XMM1,XMM7)
2970         SSE_SHUFFLE(XMM1,XMM1,0x55)
2971         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2972         SSE_ADD_PS(XMM0,XMM1)
2973 
2974         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2975 
2976         /* Third Column */
2977         SSE_COPY_PS(XMM2,XMM7)
2978         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2979         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2980         SSE_ADD_PS(XMM0,XMM2)
2981 
2982         /* Fourth Column */
2983         SSE_COPY_PS(XMM3,XMM7)
2984         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2985         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2986         SSE_ADD_PS(XMM0,XMM3)
2987 
2988         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2989       SSE_INLINE_END_3
2990 
2991       v    = aa + ai16 + 16;
2992       idt -= 4;
2993     }
2994 
2995     /* Convert t from single precision back to double precision (inplace)*/
2996     idt = 4*(n-1);
2997     for (i=n-1;i>=0;i--) {
2998       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2999       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3000       PetscScalar *xtemp=&x[idt];
3001       MatScalar   *ttemp=&t[idt];
3002       xtemp[3] = (PetscScalar)ttemp[3];
3003       xtemp[2] = (PetscScalar)ttemp[2];
3004       xtemp[1] = (PetscScalar)ttemp[1];
3005       xtemp[0] = (PetscScalar)ttemp[0];
3006       idt -= 4;
3007     }
3008 
3009   } /* End of artificial scope. */
3010   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3011   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3012   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3013   SSE_SCOPE_END;
3014   PetscFunctionReturn(0);
3015 }
3016 
3017 #endif
3018 #undef __FUNCT__
3019 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3020 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
3021 {
3022   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3023   IS                iscol=a->col,isrow=a->row;
3024   PetscErrorCode    ierr;
3025   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3026   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3027   const MatScalar   *aa=a->a,*v;
3028   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3029   const PetscScalar *b;
3030 
3031   PetscFunctionBegin;
3032   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3033   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3034   t  = a->solve_work;
3035 
3036   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3037   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3038 
3039   /* forward solve the lower triangular */
3040   idx    = 3*(*r++);
3041   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3042   for (i=1; i<n; i++) {
3043     v     = aa + 9*ai[i];
3044     vi    = aj + ai[i];
3045     nz    = diag[i] - ai[i];
3046     idx   = 3*(*r++);
3047     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3048     while (nz--) {
3049       idx   = 3*(*vi++);
3050       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3051       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3052       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3053       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3054       v += 9;
3055     }
3056     idx = 3*i;
3057     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3058   }
3059   /* backward solve the upper triangular */
3060   for (i=n-1; i>=0; i--){
3061     v    = aa + 9*diag[i] + 9;
3062     vi   = aj + diag[i] + 1;
3063     nz   = ai[i+1] - diag[i] - 1;
3064     idt  = 3*i;
3065     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3066     while (nz--) {
3067       idx   = 3*(*vi++);
3068       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3069       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3070       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3071       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3072       v += 9;
3073     }
3074     idc = 3*(*c--);
3075     v   = aa + 9*diag[i];
3076     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3077     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3078     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3079   }
3080   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3081   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3082   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3083   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3084   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3085   PetscFunctionReturn(0);
3086 }
3087 
3088 /*
3089       Special case where the matrix was ILU(0) factored in the natural
3090    ordering. This eliminates the need for the column and row permutation.
3091 */
3092 #undef __FUNCT__
3093 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3094 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
3095 {
3096   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3097   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3098   PetscErrorCode    ierr;
3099   PetscInt          *diag = a->diag;
3100   const MatScalar   *aa=a->a,*v;
3101   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3102   const PetscScalar *b;
3103   PetscInt          jdx,idt,idx,nz,*vi,i;
3104 
3105   PetscFunctionBegin;
3106   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3107   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3108 
3109   /* forward solve the lower triangular */
3110   idx    = 0;
3111   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
3112   for (i=1; i<n; i++) {
3113     v     =  aa      + 9*ai[i];
3114     vi    =  aj      + ai[i];
3115     nz    =  diag[i] - ai[i];
3116     idx   +=  3;
3117     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
3118     while (nz--) {
3119       jdx   = 3*(*vi++);
3120       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3121       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3122       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3123       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3124       v    += 9;
3125     }
3126     x[idx]   = s1;
3127     x[1+idx] = s2;
3128     x[2+idx] = s3;
3129   }
3130   /* backward solve the upper triangular */
3131   for (i=n-1; i>=0; i--){
3132     v    = aa + 9*diag[i] + 9;
3133     vi   = aj + diag[i] + 1;
3134     nz   = ai[i+1] - diag[i] - 1;
3135     idt  = 3*i;
3136     s1 = x[idt];  s2 = x[1+idt];
3137     s3 = x[2+idt];
3138     while (nz--) {
3139       idx   = 3*(*vi++);
3140       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3141       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3142       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3143       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3144       v    += 9;
3145     }
3146     v        = aa +  9*diag[i];
3147     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3148     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3149     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3150   }
3151 
3152   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3153   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3154   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3155   PetscFunctionReturn(0);
3156 }
3157 
3158 #undef __FUNCT__
3159 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3160 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3161 {
3162     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3163     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3164     PetscErrorCode    ierr;
3165     PetscInt          idx,jdx,idt;
3166     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3167     const MatScalar   *aa=a->a,*v;
3168     PetscScalar       *x;
3169     const PetscScalar *b;
3170     PetscScalar        s1,s2,s3,x1,x2,x3;
3171 
3172     PetscFunctionBegin;
3173     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3174     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3175     /* forward solve the lower triangular */
3176     idx    = 0;
3177     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3178     for (i=1; i<n; i++) {
3179        v    = aa + bs2*ai[i];
3180        vi   = aj + ai[i];
3181        nz   = ai[i+1] - ai[i];
3182       idx   = bs*i;
3183        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3184        while (nz--) {
3185           jdx   = bs*(*vi++);
3186           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3187           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3188           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3189           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3190 
3191           v   +=  bs2;
3192         }
3193 
3194        x[idx]   = s1;
3195        x[1+idx] = s2;
3196        x[2+idx] = s3;
3197     }
3198 
3199    /* backward solve the upper triangular */
3200   for (i=n-1; i>=0; i--){
3201      v   = aa + bs2*ai[2*n-i];
3202      vi  = aj + ai[2*n-i];
3203      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3204      idt = bs*i;
3205      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3206 
3207     while (nz--) {
3208       idx   = bs*(*vi++);
3209        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3210        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3211        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3212        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3213 
3214         v   +=  bs2;
3215     }
3216     /* x = inv_diagonal*x */
3217    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3218    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3219    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3220 
3221   }
3222 
3223   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3224   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3225   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3226   PetscFunctionReturn(0);
3227 }
3228 
3229 #undef __FUNCT__
3230 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
3231 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
3232 {
3233   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3234   IS                iscol=a->col,isrow=a->row;
3235   PetscErrorCode    ierr;
3236   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3237   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3238   const MatScalar   *aa=a->a,*v;
3239   PetscScalar       *x,s1,s2,x1,x2,*t;
3240   const PetscScalar *b;
3241 
3242   PetscFunctionBegin;
3243   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3244   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3245   t  = a->solve_work;
3246 
3247   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3248   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3249 
3250   /* forward solve the lower triangular */
3251   idx    = 2*(*r++);
3252   t[0] = b[idx]; t[1] = b[1+idx];
3253   for (i=1; i<n; i++) {
3254     v     = aa + 4*ai[i];
3255     vi    = aj + ai[i];
3256     nz    = diag[i] - ai[i];
3257     idx   = 2*(*r++);
3258     s1  = b[idx]; s2 = b[1+idx];
3259     while (nz--) {
3260       idx   = 2*(*vi++);
3261       x1    = t[idx]; x2 = t[1+idx];
3262       s1 -= v[0]*x1 + v[2]*x2;
3263       s2 -= v[1]*x1 + v[3]*x2;
3264       v += 4;
3265     }
3266     idx = 2*i;
3267     t[idx] = s1; t[1+idx] = s2;
3268   }
3269   /* backward solve the upper triangular */
3270   for (i=n-1; i>=0; i--){
3271     v    = aa + 4*diag[i] + 4;
3272     vi   = aj + diag[i] + 1;
3273     nz   = ai[i+1] - diag[i] - 1;
3274     idt  = 2*i;
3275     s1 = t[idt]; s2 = t[1+idt];
3276     while (nz--) {
3277       idx   = 2*(*vi++);
3278       x1    = t[idx]; x2 = t[1+idx];
3279       s1 -= v[0]*x1 + v[2]*x2;
3280       s2 -= v[1]*x1 + v[3]*x2;
3281       v += 4;
3282     }
3283     idc = 2*(*c--);
3284     v   = aa + 4*diag[i];
3285     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3286     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
3287   }
3288   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3289   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3290   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3291   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3292   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3293   PetscFunctionReturn(0);
3294 }
3295 
3296 /*
3297       Special case where the matrix was ILU(0) factored in the natural
3298    ordering. This eliminates the need for the column and row permutation.
3299 */
3300 #undef __FUNCT__
3301 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3302 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
3303 {
3304   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3305   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3306   PetscErrorCode    ierr;
3307   PetscInt          *diag = a->diag;
3308   const MatScalar   *aa=a->a,*v;
3309   PetscScalar       *x,s1,s2,x1,x2;
3310   const PetscScalar *b;
3311   PetscInt          jdx,idt,idx,nz,*vi,i;
3312 
3313   PetscFunctionBegin;
3314   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3316 
3317   /* forward solve the lower triangular */
3318   idx    = 0;
3319   x[0]   = b[0]; x[1] = b[1];
3320   for (i=1; i<n; i++) {
3321     v     =  aa      + 4*ai[i];
3322     vi    =  aj      + ai[i];
3323     nz    =  diag[i] - ai[i];
3324     idx   +=  2;
3325     s1  =  b[idx];s2 = b[1+idx];
3326     while (nz--) {
3327       jdx   = 2*(*vi++);
3328       x1    = x[jdx];x2 = x[1+jdx];
3329       s1 -= v[0]*x1 + v[2]*x2;
3330       s2 -= v[1]*x1 + v[3]*x2;
3331       v    += 4;
3332     }
3333     x[idx]   = s1;
3334     x[1+idx] = s2;
3335   }
3336   /* backward solve the upper triangular */
3337   for (i=n-1; i>=0; i--){
3338     v    = aa + 4*diag[i] + 4;
3339     vi   = aj + diag[i] + 1;
3340     nz   = ai[i+1] - diag[i] - 1;
3341     idt  = 2*i;
3342     s1 = x[idt];  s2 = x[1+idt];
3343     while (nz--) {
3344       idx   = 2*(*vi++);
3345       x1    = x[idx];   x2 = x[1+idx];
3346       s1 -= v[0]*x1 + v[2]*x2;
3347       s2 -= v[1]*x1 + v[3]*x2;
3348       v    += 4;
3349     }
3350     v        = aa +  4*diag[i];
3351     x[idt]   = v[0]*s1 + v[2]*s2;
3352     x[1+idt] = v[1]*s1 + v[3]*s2;
3353   }
3354 
3355   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3356   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3357   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3358   PetscFunctionReturn(0);
3359 }
3360 
3361 #undef __FUNCT__
3362 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
3363 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3364 {
3365     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3366     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3367     PetscErrorCode    ierr;
3368     PetscInt          jdx;
3369     const MatScalar   *aa=a->a,*v;
3370     PetscScalar       *x,s1,s2,x1,x2;
3371     const PetscScalar *b;
3372 
3373     PetscFunctionBegin;
3374     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3375     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3376     /* forward solve the lower triangular */
3377     idx    = 0;
3378     x[0] = b[idx]; x[1] = b[1+idx];
3379     for (i=1; i<n; i++) {
3380         v   = aa + 4*ai[i];
3381        vi   = aj + ai[i];
3382        nz   = ai[i+1] - ai[i];
3383        idx  = 2*i;
3384        s1   = b[idx];s2 = b[1+idx];
3385        while (nz--) {
3386           jdx   = 2*(*vi++);
3387           x1    = x[jdx];x2 = x[1+jdx];
3388           s1   -= v[0]*x1 + v[2]*x2;
3389           s2   -= v[1]*x1 + v[3]*x2;
3390            v   +=  4;
3391         }
3392        x[idx]   = s1;
3393        x[1+idx] = s2;
3394     }
3395 
3396    /* backward solve the upper triangular */
3397   for (i=n-1; i>=0; i--){
3398      v   = aa + 4*ai[2*n-i];
3399      vi  = aj + ai[2*n-i];
3400      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3401      idt = 2*i;
3402      s1 = x[idt];  s2 = x[1+idt];
3403      while (nz--) {
3404       idx   = 2*(*vi++);
3405        x1    = x[idx];   x2 = x[1+idx];
3406        s1 -= v[0]*x1 + v[2]*x2;
3407        s2 -= v[1]*x1 + v[3]*x2;
3408          v    += 4;
3409     }
3410     /* x = inv_diagonal*x */
3411    x[idt]   = v[0]*s1 + v[2]*s2;
3412    x[1+idt] = v[1]*s1 + v[3]*s2;
3413   }
3414 
3415   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3416   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3417   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3418   PetscFunctionReturn(0);
3419 }
3420 
3421 #undef __FUNCT__
3422 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3423 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
3424 {
3425   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
3426   IS             iscol=a->col,isrow=a->row;
3427   PetscErrorCode ierr;
3428   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3429   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3430   MatScalar      *aa=a->a,*v;
3431   PetscScalar    *x,*b,s1,*t;
3432 
3433   PetscFunctionBegin;
3434   if (!n) PetscFunctionReturn(0);
3435 
3436   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3437   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3438   t  = a->solve_work;
3439 
3440   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3441   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3442 
3443   /* forward solve the lower triangular */
3444   t[0] = b[*r++];
3445   for (i=1; i<n; i++) {
3446     v     = aa + ai[i];
3447     vi    = aj + ai[i];
3448     nz    = diag[i] - ai[i];
3449     s1  = b[*r++];
3450     while (nz--) {
3451       s1 -= (*v++)*t[*vi++];
3452     }
3453     t[i] = s1;
3454   }
3455   /* backward solve the upper triangular */
3456   for (i=n-1; i>=0; i--){
3457     v    = aa + diag[i] + 1;
3458     vi   = aj + diag[i] + 1;
3459     nz   = ai[i+1] - diag[i] - 1;
3460     s1 = t[i];
3461     while (nz--) {
3462       s1 -= (*v++)*t[*vi++];
3463     }
3464     x[*c--] = t[i] = aa[diag[i]]*s1;
3465   }
3466 
3467   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3468   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3469   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3470   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3471   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3472   PetscFunctionReturn(0);
3473 }
3474 /*
3475       Special case where the matrix was ILU(0) factored in the natural
3476    ordering. This eliminates the need for the column and row permutation.
3477 */
3478 #undef __FUNCT__
3479 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3480 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3481 {
3482   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3483   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3484   PetscErrorCode ierr;
3485   PetscInt       *diag = a->diag;
3486   MatScalar      *aa=a->a;
3487   PetscScalar    *x,*b;
3488   PetscScalar    s1,x1;
3489   MatScalar      *v;
3490   PetscInt       jdx,idt,idx,nz,*vi,i;
3491 
3492   PetscFunctionBegin;
3493   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3494   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3495 
3496   /* forward solve the lower triangular */
3497   idx    = 0;
3498   x[0]   = b[0];
3499   for (i=1; i<n; i++) {
3500     v     =  aa      + ai[i];
3501     vi    =  aj      + ai[i];
3502     nz    =  diag[i] - ai[i];
3503     idx   +=  1;
3504     s1  =  b[idx];
3505     while (nz--) {
3506       jdx   = *vi++;
3507       x1    = x[jdx];
3508       s1 -= v[0]*x1;
3509       v    += 1;
3510     }
3511     x[idx]   = s1;
3512   }
3513   /* backward solve the upper triangular */
3514   for (i=n-1; i>=0; i--){
3515     v    = aa + diag[i] + 1;
3516     vi   = aj + diag[i] + 1;
3517     nz   = ai[i+1] - diag[i] - 1;
3518     idt  = i;
3519     s1 = x[idt];
3520     while (nz--) {
3521       idx   = *vi++;
3522       x1    = x[idx];
3523       s1 -= v[0]*x1;
3524       v    += 1;
3525     }
3526     v        = aa +  diag[i];
3527     x[idt]   = v[0]*s1;
3528   }
3529   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3530   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3531   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3532   PetscFunctionReturn(0);
3533 }
3534 
3535 /* ----------------------------------------------------------------*/
3536 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
3537 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3538 
3539 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
3540 #undef __FUNCT__
3541 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
3542 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
3543 {
3544   Mat            C=B;
3545   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
3546   IS             isrow = b->row,isicol = b->icol;
3547   PetscErrorCode ierr;
3548   const PetscInt *r,*ic,*ics;
3549   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
3550   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3551   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3552   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3553   MatScalar      *v_work;
3554 
3555   PetscFunctionBegin;
3556   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3557   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3558   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3559   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
3560   ics  = ic;
3561 
3562   /* generate work space needed by dense LU factorization */
3563   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3564   multiplier = v_work + bs;
3565   v_pivots   = (PetscInt*)(multiplier + bs2);
3566 
3567   for (i=0; i<n; i++){
3568     /* zero rtmp */
3569     /* L part */
3570     nz    = bi[i+1] - bi[i];
3571     bjtmp = bj + bi[i];
3572     for  (j=0; j<nz; j++){
3573       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3574     }
3575 
3576     /* U part */
3577     nz = bi[2*n-i+1] - bi[2*n-i];
3578     bjtmp = bj + bi[2*n-i];
3579     for  (j=0; j<nz; j++){
3580       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3581     }
3582 
3583     /* load in initial (unfactored row) */
3584     nz    = ai[r[i]+1] - ai[r[i]];
3585     ajtmp = aj + ai[r[i]];
3586     v     = aa + bs2*ai[r[i]];
3587     for (j=0; j<nz; j++) {
3588       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
3589     }
3590 
3591     /* elimination */
3592     bjtmp = bj + bi[i];
3593     row   = *bjtmp++;
3594     nzL   = bi[i+1] - bi[i];
3595     k   = 0;
3596     while  (k < nzL) {
3597       pc = rtmp + bs2*row;
3598       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3599       if (flg) {
3600         pv         = b->a + bs2*bdiag[row];
3601         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
3602         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3603         pv         = b->a + bs2*bi[2*n-row];
3604         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3605         for (j=0; j<nz; j++) {
3606           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3607         }
3608         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
3609       }
3610       row = *bjtmp++; k++;
3611     }
3612 
3613     /* finished row so stick it into b->a */
3614     /* L part */
3615     pv   = b->a + bs2*bi[i] ;
3616     pj   = b->j + bi[i] ;
3617     nz   = bi[i+1] - bi[i];
3618     for (j=0; j<nz; j++) {
3619       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3620     }
3621 
3622     /* Mark diagonal and invert diagonal for simplier triangular solves */
3623     pv  = b->a + bs2*bdiag[i];
3624     pj  = b->j + bdiag[i];
3625     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3626     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3627     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
3628 
3629     /* U part */
3630     pv = b->a + bs2*bi[2*n-i];
3631     pj = b->j + bi[2*n-i];
3632     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3633     for (j=0; j<nz; j++){
3634       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3635     }
3636   }
3637 
3638   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3639   ierr = PetscFree(v_work);CHKERRQ(ierr);
3640   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3641   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3642 
3643   C->assembled = PETSC_TRUE;
3644   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
3645   PetscFunctionReturn(0);
3646 }
3647 
3648 /*
3649    ilu(0) with natural ordering under new data structure.
3650    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
3651    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
3652 */
3653 #undef __FUNCT__
3654 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
3655 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3656 {
3657 
3658   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
3659   PetscErrorCode     ierr;
3660   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
3661   PetscInt           i,j,nz,*bi,*bj,*bdiag;
3662 
3663   PetscFunctionBegin;
3664   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
3665   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
3666   b    = (Mat_SeqBAIJ*)(fact)->data;
3667 
3668   /* allocate matrix arrays for new data structure */
3669   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
3670   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
3671   b->singlemalloc = PETSC_TRUE;
3672   if (!b->diag){
3673     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
3674   }
3675   bdiag = b->diag;
3676 
3677   if (n > 0) {
3678     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
3679   }
3680 
3681   /* set bi and bj with new data structure */
3682   bi = b->i;
3683   bj = b->j;
3684 
3685   /* L part */
3686   bi[0] = 0;
3687   for (i=0; i<n; i++){
3688     nz = adiag[i] - ai[i];
3689     bi[i+1] = bi[i] + nz;
3690     aj = a->j + ai[i];
3691     for (j=0; j<nz; j++){
3692       *bj = aj[j]; bj++;
3693     }
3694   }
3695 
3696   /* U part */
3697   bi[n+1] = bi[n];
3698   for (i=n-1; i>=0; i--){
3699     nz = ai[i+1] - adiag[i] - 1;
3700     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
3701     aj = a->j + adiag[i] + 1;
3702     for (j=0; j<nz; j++){
3703       *bj = aj[j]; bj++;
3704     }
3705     /* diag[i] */
3706     *bj = i; bj++;
3707     bdiag[i] = bi[2*n-i+1]-1;
3708   }
3709   PetscFunctionReturn(0);
3710 }
3711 
3712 extern PetscErrorCode PetscFreeSpaceContiguous_newdatastruct(PetscFreeSpaceList *,PetscInt *,PetscInt,PetscInt *,PetscInt *);
3713 #undef __FUNCT__
3714 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
3715 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3716 {
3717   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
3718   IS                 isicol;
3719   PetscErrorCode     ierr;
3720   const PetscInt     *r,*ic;
3721   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
3722   PetscInt           *bi,*cols,nnz,*cols_lvl;
3723   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
3724   PetscInt           i,levels,diagonal_fill;
3725   PetscTruth         col_identity,row_identity,both_identity;
3726   PetscReal          f;
3727   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
3728   PetscBT            lnkbt;
3729   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
3730   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
3731   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
3732   PetscTruth         missing;
3733   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
3734 
3735   PetscFunctionBegin;
3736   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
3737   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
3738   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
3739 
3740   f             = info->fill;
3741   levels        = (PetscInt)info->levels;
3742   diagonal_fill = (PetscInt)info->diagonal_fill;
3743   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3744 
3745   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3746   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3747   both_identity = (PetscTruth) (row_identity && col_identity);
3748 
3749   if (!levels && both_identity) {
3750     /* special case: ilu(0) with natural ordering */
3751     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
3752     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
3753     /* set MatSolve routines */
3754     switch (bs){
3755     case 2:
3756       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
3757       break;
3758     case 3:
3759       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
3760       break;
3761     case 4:
3762       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
3763       break;
3764     case 5:
3765       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3766       break;
3767     case 6:
3768       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
3769       break;
3770     case 7:
3771       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
3772       break;
3773     default:
3774       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3775       break;
3776     }
3777 
3778     fact->factor = MAT_FACTOR_ILU;
3779     (fact)->info.factor_mallocs    = 0;
3780     (fact)->info.fill_ratio_given  = info->fill;
3781     (fact)->info.fill_ratio_needed = 1.0;
3782     b                = (Mat_SeqBAIJ*)(fact)->data;
3783     b->row           = isrow;
3784     b->col           = iscol;
3785     b->icol          = isicol;
3786     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3787     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3788     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3789     ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3790     PetscFunctionReturn(0);
3791   }
3792 
3793   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3794   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3795 
3796   /* get new row pointers */
3797   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
3798   bi[0] = 0;
3799   /* bdiag is location of diagonal in factor */
3800   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
3801   bdiag[0]  = 0;
3802 
3803   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
3804   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
3805 
3806   /* create a linked list for storing column indices of the active row */
3807   nlnk = n + 1;
3808   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
3809 
3810   /* initial FreeSpace size is f*(ai[n]+1) */
3811   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
3812   current_space = free_space;
3813   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
3814   current_space_lvl = free_space_lvl;
3815 
3816   for (i=0; i<n; i++) {
3817     nzi = 0;
3818     /* copy current row into linked list */
3819     nnz  = ai[r[i]+1] - ai[r[i]];
3820     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
3821     cols = aj + ai[r[i]];
3822     lnk[i] = -1; /* marker to indicate if diagonal exists */
3823     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
3824     nzi += nlnk;
3825 
3826     /* make sure diagonal entry is included */
3827     if (diagonal_fill && lnk[i] == -1) {
3828       fm = n;
3829       while (lnk[fm] < i) fm = lnk[fm];
3830       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
3831       lnk[fm]    = i;
3832       lnk_lvl[i] = 0;
3833       nzi++; dcount++;
3834     }
3835 
3836     /* add pivot rows into the active row */
3837     nzbd = 0;
3838     prow = lnk[n];
3839     while (prow < i) {
3840       nnz      = bdiag[prow];
3841       cols     = bj_ptr[prow] + nnz + 1;
3842       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
3843       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
3844       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
3845       nzi += nlnk;
3846       prow = lnk[prow];
3847       nzbd++;
3848     }
3849     bdiag[i] = nzbd;
3850     bi[i+1]  = bi[i] + nzi;
3851 
3852     /* if free space is not available, make more free space */
3853     if (current_space->local_remaining<nzi) {
3854       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
3855       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
3856       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
3857       reallocs++;
3858     }
3859 
3860     /* copy data into free_space and free_space_lvl, then initialize lnk */
3861     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
3862     bj_ptr[i]    = current_space->array;
3863     bjlvl_ptr[i] = current_space_lvl->array;
3864 
3865     /* make sure the active row i has diagonal entry */
3866     if (*(bj_ptr[i]+bdiag[i]) != i) {
3867       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3868     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
3869     }
3870 
3871     current_space->array           += nzi;
3872     current_space->local_used      += nzi;
3873     current_space->local_remaining -= nzi;
3874     current_space_lvl->array           += nzi;
3875     current_space_lvl->local_used      += nzi;
3876     current_space_lvl->local_remaining -= nzi;
3877   }
3878 
3879   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3880   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3881 
3882   /* destroy list of free space and other temporary arrays */
3883   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
3884 
3885   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
3886   ierr = PetscFreeSpaceContiguous_newdatastruct(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
3887 
3888   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
3889   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
3890   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
3891 
3892 #if defined(PETSC_USE_INFO)
3893   {
3894     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
3895     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
3896     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3897     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
3898     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3899     if (diagonal_fill) {
3900       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
3901     }
3902   }
3903 #endif
3904 
3905   /* put together the new matrix */
3906   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3907   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3908   b = (Mat_SeqBAIJ*)(fact)->data;
3909   b->free_a       = PETSC_TRUE;
3910   b->free_ij      = PETSC_TRUE;
3911   b->singlemalloc = PETSC_FALSE;
3912   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3913   b->j          = bj;
3914   b->i          = bi;
3915   b->diag       = bdiag;
3916   b->ilen       = 0;
3917   b->imax       = 0;
3918   b->row        = isrow;
3919   b->col        = iscol;
3920   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3921   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3922   b->icol       = isicol;
3923   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3924   /* In b structure:  Free imax, ilen, old a, old j.
3925      Allocate bdiag, solve_work, new a, new j */
3926   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
3927   b->maxnz = b->nz = bi[2*n+1] ;
3928   (fact)->info.factor_mallocs    = reallocs;
3929   (fact)->info.fill_ratio_given  = f;
3930   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
3931   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
3932   /* set MatSolve routines */
3933   if (both_identity){
3934     switch (bs){
3935     case 2:
3936       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
3937       break;
3938     case 3:
3939       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
3940       break;
3941     case 4:
3942       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
3943       break;
3944     case 5:
3945       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3946       break;
3947     case 6:
3948       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
3949       break;
3950     case 7:
3951       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
3952       break;
3953     default:
3954       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3955       break;
3956     }
3957   } else {
3958     switch (bs){
3959       /* not implemented yet!
3960     case 2:
3961       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
3962       break;
3963     case 3:
3964       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
3965       break;
3966     case 4:
3967       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
3968       break;
3969     case 5:
3970       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
3971       break;
3972     case 6:
3973       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
3974       break;
3975     case 7:
3976       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
3977       break;
3978     default:
3979       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
3980       break;
3981       */
3982     }
3983   }
3984   PetscFunctionReturn(0);
3985 }
3986 
3987 /*
3988      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3989    except that the data structure of Mat_SeqAIJ is slightly different.
3990    Not a good example of code reuse.
3991 */
3992 #undef __FUNCT__
3993 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3994 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3995 {
3996   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3997   IS             isicol;
3998   PetscErrorCode ierr;
3999   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
4000   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
4001   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
4002   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
4003   PetscTruth     col_identity,row_identity,both_identity,flg;
4004   PetscReal      f;
4005   PetscTruth     newdatastruct=PETSC_FALSE;
4006 
4007   PetscFunctionBegin;
4008   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
4009   if (newdatastruct){
4010     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4011     PetscFunctionReturn(0);
4012   }
4013 
4014   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
4015   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
4016 
4017   f             = info->fill;
4018   levels        = (PetscInt)info->levels;
4019   diagonal_fill = (PetscInt)info->diagonal_fill;
4020   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4021 
4022   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4023   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4024   both_identity = (PetscTruth) (row_identity && col_identity);
4025 
4026   if (!levels && both_identity) {  /* special case copy the nonzero structure */
4027     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
4028     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
4029 
4030     fact->factor = MAT_FACTOR_ILU;
4031     b            = (Mat_SeqBAIJ*)(fact)->data;
4032     b->row       = isrow;
4033     b->col       = iscol;
4034     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4035     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4036     b->icol      = isicol;
4037     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4038     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4039     PetscFunctionReturn(0);
4040   }
4041 
4042   /* general case perform the symbolic factorization */
4043     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4044     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4045 
4046     /* get new row pointers */
4047     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
4048     ainew[0] = 0;
4049     /* don't know how many column pointers are needed so estimate */
4050     jmax = (PetscInt)(f*ai[n] + 1);
4051     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
4052     /* ajfill is level of fill for each fill entry */
4053     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
4054     /* fill is a linked list of nonzeros in active row */
4055     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
4056     /* im is level for each filled value */
4057     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
4058     /* dloc is location of diagonal in factor */
4059     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
4060     dloc[0]  = 0;
4061     for (prow=0; prow<n; prow++) {
4062 
4063       /* copy prow into linked list */
4064       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
4065       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
4066       xi         = aj + ai[r[prow]];
4067       fill[n]    = n;
4068       fill[prow] = -1; /* marker for diagonal entry */
4069       while (nz--) {
4070 	fm  = n;
4071 	idx = ic[*xi++];
4072 	do {
4073 	  m  = fm;
4074 	  fm = fill[m];
4075 	} while (fm < idx);
4076 	fill[m]   = idx;
4077 	fill[idx] = fm;
4078 	im[idx]   = 0;
4079       }
4080 
4081       /* make sure diagonal entry is included */
4082       if (diagonal_fill && fill[prow] == -1) {
4083 	fm = n;
4084 	while (fill[fm] < prow) fm = fill[fm];
4085 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
4086 	fill[fm]   = prow;
4087 	im[prow]   = 0;
4088 	nzf++;
4089 	dcount++;
4090       }
4091 
4092       nzi = 0;
4093       row = fill[n];
4094       while (row < prow) {
4095 	incrlev = im[row] + 1;
4096 	nz      = dloc[row];
4097 	xi      = ajnew  + ainew[row] + nz + 1;
4098 	flev    = ajfill + ainew[row] + nz + 1;
4099 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
4100 	fm      = row;
4101 	while (nnz-- > 0) {
4102 	  idx = *xi++;
4103 	  if (*flev + incrlev > levels) {
4104 	    flev++;
4105 	    continue;
4106 	  }
4107 	  do {
4108 	    m  = fm;
4109 	    fm = fill[m];
4110 	  } while (fm < idx);
4111 	  if (fm != idx) {
4112 	    im[idx]   = *flev + incrlev;
4113 	    fill[m]   = idx;
4114 	    fill[idx] = fm;
4115 	    fm        = idx;
4116 	    nzf++;
4117 	  } else {
4118 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
4119 	  }
4120 	  flev++;
4121 	}
4122 	row = fill[row];
4123 	nzi++;
4124       }
4125       /* copy new filled row into permanent storage */
4126       ainew[prow+1] = ainew[prow] + nzf;
4127       if (ainew[prow+1] > jmax) {
4128 
4129 	/* estimate how much additional space we will need */
4130 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
4131 	/* just double the memory each time */
4132 	PetscInt maxadd = jmax;
4133 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
4134 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
4135 	jmax += maxadd;
4136 
4137 	/* allocate a longer ajnew and ajfill */
4138 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
4139 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4140 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
4141 	ajnew = xitmp;
4142 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
4143 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4144 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
4145 	ajfill = xitmp;
4146 	reallocate++; /* count how many reallocations are needed */
4147       }
4148       xitmp       = ajnew + ainew[prow];
4149       flev        = ajfill + ainew[prow];
4150       dloc[prow]  = nzi;
4151       fm          = fill[n];
4152       while (nzf--) {
4153 	*xitmp++ = fm;
4154 	*flev++ = im[fm];
4155 	fm      = fill[fm];
4156       }
4157       /* make sure row has diagonal entry */
4158       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
4159 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
4160     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
4161       }
4162     }
4163     ierr = PetscFree(ajfill);CHKERRQ(ierr);
4164     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4165     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4166     ierr = PetscFree(fill);CHKERRQ(ierr);
4167     ierr = PetscFree(im);CHKERRQ(ierr);
4168 
4169 #if defined(PETSC_USE_INFO)
4170     {
4171       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
4172       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
4173       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4174       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
4175       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4176       if (diagonal_fill) {
4177 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
4178       }
4179     }
4180 #endif
4181 
4182     /* put together the new matrix */
4183     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4184     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4185     b    = (Mat_SeqBAIJ*)(fact)->data;
4186     b->free_a       = PETSC_TRUE;
4187     b->free_ij      = PETSC_TRUE;
4188     b->singlemalloc = PETSC_FALSE;
4189     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
4190     b->j          = ajnew;
4191     b->i          = ainew;
4192     for (i=0; i<n; i++) dloc[i] += ainew[i];
4193     b->diag       = dloc;
4194     b->ilen       = 0;
4195     b->imax       = 0;
4196     b->row        = isrow;
4197     b->col        = iscol;
4198     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4199     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4200     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4201     b->icol       = isicol;
4202     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4203     /* In b structure:  Free imax, ilen, old a, old j.
4204        Allocate dloc, solve_work, new a, new j */
4205     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
4206     b->maxnz          = b->nz = ainew[n];
4207 
4208     (fact)->info.factor_mallocs    = reallocate;
4209     (fact)->info.fill_ratio_given  = f;
4210     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
4211 
4212   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
4213   PetscFunctionReturn(0);
4214 }
4215 
4216 #undef __FUNCT__
4217 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
4218 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
4219 {
4220   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
4221   /* int i,*AJ=a->j,nz=a->nz; */
4222   PetscFunctionBegin;
4223   /* Undo Column scaling */
4224 /*    while (nz--) { */
4225 /*      AJ[i] = AJ[i]/4; */
4226 /*    } */
4227   /* This should really invoke a push/pop logic, but we don't have that yet. */
4228   A->ops->setunfactored = PETSC_NULL;
4229   PetscFunctionReturn(0);
4230 }
4231 
4232 #undef __FUNCT__
4233 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
4234 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
4235 {
4236   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4237   PetscInt       *AJ=a->j,nz=a->nz;
4238   unsigned short *aj=(unsigned short *)AJ;
4239   PetscFunctionBegin;
4240   /* Is this really necessary? */
4241   while (nz--) {
4242     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
4243   }
4244   A->ops->setunfactored = PETSC_NULL;
4245   PetscFunctionReturn(0);
4246 }
4247 
4248 
4249