xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 485e8061b00aa6aa607af9dbdc9c890e58f9f145)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124   PetscInt       *diag = a->diag,oidx;
125   MatScalar      *aa=a->a,*v;
126   PetscScalar    s1,s2,s3,x1,x2,x3;
127   PetscScalar    *x,*b;
128 
129   PetscFunctionBegin;
130   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133 
134   /* forward solve the U^T */
135   idx = 0;
136   for (i=0; i<n; i++) {
137 
138     v     = aa + 9*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144     v += 9;
145 
146     vi    = aj + diag[i] + 1;
147     nz    = ai[i+1] - diag[i] - 1;
148     while (nz--) {
149       oidx = 3*(*vi++);
150       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153       v  += 9;
154     }
155     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156     idx += 3;
157   }
158   /* backward solve the L^T */
159   for (i=n-1; i>=0; i--){
160     v    = aa + 9*diag[i] - 9;
161     vi   = aj + diag[i] - 1;
162     nz   = diag[i] - ai[i];
163     idt  = 3*i;
164     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165     while (nz--) {
166       idx   = 3*(*vi--);
167       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170       v -= 9;
171     }
172   }
173   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182 {
183   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184   PetscErrorCode ierr;
185   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186   PetscInt       *diag = a->diag,oidx;
187   MatScalar      *aa=a->a,*v;
188   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
189   PetscScalar    *x,*b;
190 
191   PetscFunctionBegin;
192   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
193   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195 
196   /* forward solve the U^T */
197   idx = 0;
198   for (i=0; i<n; i++) {
199 
200     v     = aa + 16*diag[i];
201     /* multiply by the inverse of the block diagonal */
202     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207     v += 16;
208 
209     vi    = aj + diag[i] + 1;
210     nz    = ai[i+1] - diag[i] - 1;
211     while (nz--) {
212       oidx = 4*(*vi++);
213       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217       v  += 16;
218     }
219     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220     idx += 4;
221   }
222   /* backward solve the L^T */
223   for (i=n-1; i>=0; i--){
224     v    = aa + 16*diag[i] - 16;
225     vi   = aj + diag[i] - 1;
226     nz   = diag[i] - ai[i];
227     idt  = 4*i;
228     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229     while (nz--) {
230       idx   = 4*(*vi--);
231       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235       v -= 16;
236     }
237   }
238   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
239   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247 {
248   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249   PetscErrorCode ierr;
250   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251   PetscInt       *diag = a->diag,oidx;
252   MatScalar      *aa=a->a,*v;
253   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
254   PetscScalar    *x,*b;
255 
256   PetscFunctionBegin;
257   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
258   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260 
261   /* forward solve the U^T */
262   idx = 0;
263   for (i=0; i<n; i++) {
264 
265     v     = aa + 25*diag[i];
266     /* multiply by the inverse of the block diagonal */
267     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273     v += 25;
274 
275     vi    = aj + diag[i] + 1;
276     nz    = ai[i+1] - diag[i] - 1;
277     while (nz--) {
278       oidx = 5*(*vi++);
279       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284       v  += 25;
285     }
286     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287     idx += 5;
288   }
289   /* backward solve the L^T */
290   for (i=n-1; i>=0; i--){
291     v    = aa + 25*diag[i] - 25;
292     vi   = aj + diag[i] - 1;
293     nz   = diag[i] - ai[i];
294     idt  = 5*i;
295     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296     while (nz--) {
297       idx   = 5*(*vi--);
298       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303       v -= 25;
304     }
305   }
306   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
307   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315 {
316   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317   PetscErrorCode ierr;
318   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319   PetscInt       *diag = a->diag,oidx;
320   MatScalar      *aa=a->a,*v;
321   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
322   PetscScalar    *x,*b;
323 
324   PetscFunctionBegin;
325   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
326   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
327   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328 
329   /* forward solve the U^T */
330   idx = 0;
331   for (i=0; i<n; i++) {
332 
333     v     = aa + 36*diag[i];
334     /* multiply by the inverse of the block diagonal */
335     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336     x6    = x[5+idx];
337     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343     v += 36;
344 
345     vi    = aj + diag[i] + 1;
346     nz    = ai[i+1] - diag[i] - 1;
347     while (nz--) {
348       oidx = 6*(*vi++);
349       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355       v  += 36;
356     }
357     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358     x[5+idx] = s6;
359     idx += 6;
360   }
361   /* backward solve the L^T */
362   for (i=n-1; i>=0; i--){
363     v    = aa + 36*diag[i] - 36;
364     vi   = aj + diag[i] - 1;
365     nz   = diag[i] - ai[i];
366     idt  = 6*i;
367     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368     s6 = x[5+idt];
369     while (nz--) {
370       idx   = 6*(*vi--);
371       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377       v -= 36;
378     }
379   }
380   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389 {
390   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391   PetscErrorCode ierr;
392   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393   PetscInt       *diag = a->diag,oidx;
394   MatScalar      *aa=a->a,*v;
395   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
396   PetscScalar    *x,*b;
397 
398   PetscFunctionBegin;
399   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
400   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
401   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402 
403   /* forward solve the U^T */
404   idx = 0;
405   for (i=0; i<n; i++) {
406 
407     v     = aa + 49*diag[i];
408     /* multiply by the inverse of the block diagonal */
409     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410     x6    = x[5+idx]; x7 = x[6+idx];
411     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418     v += 49;
419 
420     vi    = aj + diag[i] + 1;
421     nz    = ai[i+1] - diag[i] - 1;
422     while (nz--) {
423       oidx = 7*(*vi++);
424       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431       v  += 49;
432     }
433     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434     x[5+idx] = s6;x[6+idx] = s7;
435     idx += 7;
436   }
437   /* backward solve the L^T */
438   for (i=n-1; i>=0; i--){
439     v    = aa + 49*diag[i] - 49;
440     vi   = aj + diag[i] - 1;
441     nz   = diag[i] - ai[i];
442     idt  = 7*i;
443     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444     s6 = x[5+idt];s7 = x[6+idt];
445     while (nz--) {
446       idx   = 7*(*vi--);
447       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454       v -= 49;
455     }
456   }
457   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 /*---------------------------------------------------------------------------------------------*/
464 #undef __FUNCT__
465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469   IS             iscol=a->col,isrow=a->row;
470   PetscErrorCode ierr;
471   const PetscInt *r,*c,*rout,*cout;
472   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473   PetscInt       *diag = a->diag;
474   MatScalar      *aa=a->a,*v;
475   PetscScalar    s1,*x,*b,*t;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   t  = a->solve_work;
481 
482   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484 
485   /* copy the b into temp work space according to permutation */
486   for (i=0; i<n; i++) {
487     t[i] = b[c[i]];
488   }
489 
490   /* forward solve the U^T */
491   for (i=0; i<n; i++) {
492 
493     v     = aa + diag[i];
494     /* multiply by the inverse of the block diagonal */
495     s1    = (*v++)*t[i];
496     vi    = aj + diag[i] + 1;
497     nz    = ai[i+1] - diag[i] - 1;
498     while (nz--) {
499       t[*vi++]  -= (*v++)*s1;
500     }
501     t[i]   = s1;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--){
505     v    = aa + diag[i] - 1;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     s1   = t[i];
509     while (nz--) {
510       t[*vi--]   -=  (*v--)*s1;
511     }
512   }
513 
514   /* copy t into x according to permutation */
515   for (i=0; i<n; i++) {
516     x[r[i]]   = t[i];
517   }
518 
519   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
521   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524   PetscFunctionReturn(0);
525 }
526 
527 #undef __FUNCT__
528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530 {
531   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532   IS             iscol=a->col,isrow=a->row;
533   PetscErrorCode ierr;
534   const PetscInt *r,*c,*rout,*cout;
535   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537   MatScalar      *aa=a->a,*v;
538   PetscScalar    s1,s2,x1,x2;
539   PetscScalar    *x,*b,*t;
540 
541   PetscFunctionBegin;
542   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544   t  = a->solve_work;
545 
546   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548 
549   /* copy the b into temp work space according to permutation */
550   ii = 0;
551   for (i=0; i<n; i++) {
552     ic      = 2*c[i];
553     t[ii]   = b[ic];
554     t[ii+1] = b[ic+1];
555     ii += 2;
556   }
557 
558   /* forward solve the U^T */
559   idx = 0;
560   for (i=0; i<n; i++) {
561 
562     v     = aa + 4*diag[i];
563     /* multiply by the inverse of the block diagonal */
564     x1    = t[idx];   x2 = t[1+idx];
565     s1 = v[0]*x1  +  v[1]*x2;
566     s2 = v[2]*x1  +  v[3]*x2;
567     v += 4;
568 
569     vi    = aj + diag[i] + 1;
570     nz    = ai[i+1] - diag[i] - 1;
571     while (nz--) {
572       oidx = 2*(*vi++);
573       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575       v  += 4;
576     }
577     t[idx]   = s1;t[1+idx] = s2;
578     idx += 2;
579   }
580   /* backward solve the L^T */
581   for (i=n-1; i>=0; i--){
582     v    = aa + 4*diag[i] - 4;
583     vi   = aj + diag[i] - 1;
584     nz   = diag[i] - ai[i];
585     idt  = 2*i;
586     s1 = t[idt];  s2 = t[1+idt];
587     while (nz--) {
588       idx   = 2*(*vi--);
589       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591       v -= 4;
592     }
593   }
594 
595   /* copy t into x according to permutation */
596   ii = 0;
597   for (i=0; i<n; i++) {
598     ir      = 2*r[i];
599     x[ir]   = t[ii];
600     x[ir+1] = t[ii+1];
601     ii += 2;
602   }
603 
604   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
606   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615 {
616   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617   IS             iscol=a->col,isrow=a->row;
618   PetscErrorCode ierr;
619   const PetscInt *r,*c,*rout,*cout;
620   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622   MatScalar      *aa=a->a,*v;
623   PetscScalar    s1,s2,s3,x1,x2,x3;
624   PetscScalar    *x,*b,*t;
625 
626   PetscFunctionBegin;
627   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629   t  = a->solve_work;
630 
631   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633 
634   /* copy the b into temp work space according to permutation */
635   ii = 0;
636   for (i=0; i<n; i++) {
637     ic      = 3*c[i];
638     t[ii]   = b[ic];
639     t[ii+1] = b[ic+1];
640     t[ii+2] = b[ic+2];
641     ii += 3;
642   }
643 
644   /* forward solve the U^T */
645   idx = 0;
646   for (i=0; i<n; i++) {
647 
648     v     = aa + 9*diag[i];
649     /* multiply by the inverse of the block diagonal */
650     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654     v += 9;
655 
656     vi    = aj + diag[i] + 1;
657     nz    = ai[i+1] - diag[i] - 1;
658     while (nz--) {
659       oidx = 3*(*vi++);
660       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663       v  += 9;
664     }
665     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666     idx += 3;
667   }
668   /* backward solve the L^T */
669   for (i=n-1; i>=0; i--){
670     v    = aa + 9*diag[i] - 9;
671     vi   = aj + diag[i] - 1;
672     nz   = diag[i] - ai[i];
673     idt  = 3*i;
674     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675     while (nz--) {
676       idx   = 3*(*vi--);
677       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680       v -= 9;
681     }
682   }
683 
684   /* copy t into x according to permutation */
685   ii = 0;
686   for (i=0; i<n; i++) {
687     ir      = 3*r[i];
688     x[ir]   = t[ii];
689     x[ir+1] = t[ii+1];
690     x[ir+2] = t[ii+2];
691     ii += 3;
692   }
693 
694   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
696   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 #undef __FUNCT__
703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705 {
706   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707   IS             iscol=a->col,isrow=a->row;
708   PetscErrorCode ierr;
709   const PetscInt *r,*c,*rout,*cout;
710   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712   MatScalar      *aa=a->a,*v;
713   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
714   PetscScalar    *x,*b,*t;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   t  = a->solve_work;
720 
721   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723 
724   /* copy the b into temp work space according to permutation */
725   ii = 0;
726   for (i=0; i<n; i++) {
727     ic      = 4*c[i];
728     t[ii]   = b[ic];
729     t[ii+1] = b[ic+1];
730     t[ii+2] = b[ic+2];
731     t[ii+3] = b[ic+3];
732     ii += 4;
733   }
734 
735   /* forward solve the U^T */
736   idx = 0;
737   for (i=0; i<n; i++) {
738 
739     v     = aa + 16*diag[i];
740     /* multiply by the inverse of the block diagonal */
741     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746     v += 16;
747 
748     vi    = aj + diag[i] + 1;
749     nz    = ai[i+1] - diag[i] - 1;
750     while (nz--) {
751       oidx = 4*(*vi++);
752       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756       v  += 16;
757     }
758     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759     idx += 4;
760   }
761   /* backward solve the L^T */
762   for (i=n-1; i>=0; i--){
763     v    = aa + 16*diag[i] - 16;
764     vi   = aj + diag[i] - 1;
765     nz   = diag[i] - ai[i];
766     idt  = 4*i;
767     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768     while (nz--) {
769       idx   = 4*(*vi--);
770       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774       v -= 16;
775     }
776   }
777 
778   /* copy t into x according to permutation */
779   ii = 0;
780   for (i=0; i<n; i++) {
781     ir      = 4*r[i];
782     x[ir]   = t[ii];
783     x[ir+1] = t[ii+1];
784     x[ir+2] = t[ii+2];
785     x[ir+3] = t[ii+3];
786     ii += 4;
787   }
788 
789   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
791   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 #undef __FUNCT__
798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800 {
801   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802   IS             iscol=a->col,isrow=a->row;
803   PetscErrorCode ierr;
804   const PetscInt *r,*c,*rout,*cout;
805   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807   MatScalar      *aa=a->a,*v;
808   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
809   PetscScalar    *x,*b,*t;
810 
811   PetscFunctionBegin;
812   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   t  = a->solve_work;
815 
816   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818 
819   /* copy the b into temp work space according to permutation */
820   ii = 0;
821   for (i=0; i<n; i++) {
822     ic      = 5*c[i];
823     t[ii]   = b[ic];
824     t[ii+1] = b[ic+1];
825     t[ii+2] = b[ic+2];
826     t[ii+3] = b[ic+3];
827     t[ii+4] = b[ic+4];
828     ii += 5;
829   }
830 
831   /* forward solve the U^T */
832   idx = 0;
833   for (i=0; i<n; i++) {
834 
835     v     = aa + 25*diag[i];
836     /* multiply by the inverse of the block diagonal */
837     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843     v += 25;
844 
845     vi    = aj + diag[i] + 1;
846     nz    = ai[i+1] - diag[i] - 1;
847     while (nz--) {
848       oidx = 5*(*vi++);
849       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v  += 25;
855     }
856     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857     idx += 5;
858   }
859   /* backward solve the L^T */
860   for (i=n-1; i>=0; i--){
861     v    = aa + 25*diag[i] - 25;
862     vi   = aj + diag[i] - 1;
863     nz   = diag[i] - ai[i];
864     idt  = 5*i;
865     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866     while (nz--) {
867       idx   = 5*(*vi--);
868       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873       v -= 25;
874     }
875   }
876 
877   /* copy t into x according to permutation */
878   ii = 0;
879   for (i=0; i<n; i++) {
880     ir      = 5*r[i];
881     x[ir]   = t[ii];
882     x[ir+1] = t[ii+1];
883     x[ir+2] = t[ii+2];
884     x[ir+3] = t[ii+3];
885     x[ir+4] = t[ii+4];
886     ii += 5;
887   }
888 
889   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
891   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902   IS             iscol=a->col,isrow=a->row;
903   PetscErrorCode ierr;
904   const PetscInt *r,*c,*rout,*cout;
905   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907   MatScalar      *aa=a->a,*v;
908   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
909   PetscScalar    *x,*b,*t;
910 
911   PetscFunctionBegin;
912   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914   t  = a->solve_work;
915 
916   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918 
919   /* copy the b into temp work space according to permutation */
920   ii = 0;
921   for (i=0; i<n; i++) {
922     ic      = 6*c[i];
923     t[ii]   = b[ic];
924     t[ii+1] = b[ic+1];
925     t[ii+2] = b[ic+2];
926     t[ii+3] = b[ic+3];
927     t[ii+4] = b[ic+4];
928     t[ii+5] = b[ic+5];
929     ii += 6;
930   }
931 
932   /* forward solve the U^T */
933   idx = 0;
934   for (i=0; i<n; i++) {
935 
936     v     = aa + 36*diag[i];
937     /* multiply by the inverse of the block diagonal */
938     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939     x6    = t[5+idx];
940     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946     v += 36;
947 
948     vi    = aj + diag[i] + 1;
949     nz    = ai[i+1] - diag[i] - 1;
950     while (nz--) {
951       oidx = 6*(*vi++);
952       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958       v  += 36;
959     }
960     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961     t[5+idx] = s6;
962     idx += 6;
963   }
964   /* backward solve the L^T */
965   for (i=n-1; i>=0; i--){
966     v    = aa + 36*diag[i] - 36;
967     vi   = aj + diag[i] - 1;
968     nz   = diag[i] - ai[i];
969     idt  = 6*i;
970     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971     s6 = t[5+idt];
972     while (nz--) {
973       idx   = 6*(*vi--);
974       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980       v -= 36;
981     }
982   }
983 
984   /* copy t into x according to permutation */
985   ii = 0;
986   for (i=0; i<n; i++) {
987     ir      = 6*r[i];
988     x[ir]   = t[ii];
989     x[ir+1] = t[ii+1];
990     x[ir+2] = t[ii+2];
991     x[ir+3] = t[ii+3];
992     x[ir+4] = t[ii+4];
993     x[ir+5] = t[ii+5];
994     ii += 6;
995   }
996 
997   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
999   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008 {
1009   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010   IS             iscol=a->col,isrow=a->row;
1011   PetscErrorCode ierr;
1012   const PetscInt *r,*c,*rout,*cout;
1013   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015   MatScalar      *aa=a->a,*v;
1016   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1017   PetscScalar    *x,*b,*t;
1018 
1019   PetscFunctionBegin;
1020   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022   t  = a->solve_work;
1023 
1024   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026 
1027   /* copy the b into temp work space according to permutation */
1028   ii = 0;
1029   for (i=0; i<n; i++) {
1030     ic      = 7*c[i];
1031     t[ii]   = b[ic];
1032     t[ii+1] = b[ic+1];
1033     t[ii+2] = b[ic+2];
1034     t[ii+3] = b[ic+3];
1035     t[ii+4] = b[ic+4];
1036     t[ii+5] = b[ic+5];
1037     t[ii+6] = b[ic+6];
1038     ii += 7;
1039   }
1040 
1041   /* forward solve the U^T */
1042   idx = 0;
1043   for (i=0; i<n; i++) {
1044 
1045     v     = aa + 49*diag[i];
1046     /* multiply by the inverse of the block diagonal */
1047     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048     x6    = t[5+idx]; x7 = t[6+idx];
1049     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056     v += 49;
1057 
1058     vi    = aj + diag[i] + 1;
1059     nz    = ai[i+1] - diag[i] - 1;
1060     while (nz--) {
1061       oidx = 7*(*vi++);
1062       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v  += 49;
1070     }
1071     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072     t[5+idx] = s6;t[6+idx] = s7;
1073     idx += 7;
1074   }
1075   /* backward solve the L^T */
1076   for (i=n-1; i>=0; i--){
1077     v    = aa + 49*diag[i] - 49;
1078     vi   = aj + diag[i] - 1;
1079     nz   = diag[i] - ai[i];
1080     idt  = 7*i;
1081     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082     s6 = t[5+idt];s7 = t[6+idt];
1083     while (nz--) {
1084       idx   = 7*(*vi--);
1085       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092       v -= 49;
1093     }
1094   }
1095 
1096   /* copy t into x according to permutation */
1097   ii = 0;
1098   for (i=0; i<n; i++) {
1099     ir      = 7*r[i];
1100     x[ir]   = t[ii];
1101     x[ir+1] = t[ii+1];
1102     x[ir+2] = t[ii+2];
1103     x[ir+3] = t[ii+3];
1104     x[ir+4] = t[ii+4];
1105     x[ir+5] = t[ii+5];
1106     x[ir+6] = t[ii+6];
1107     ii += 7;
1108   }
1109 
1110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1112   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 /* ----------------------------------------------------------- */
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1124   IS             iscol=a->col,isrow=a->row;
1125   PetscErrorCode ierr;
1126   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1127   PetscInt       i,n=a->mbs;
1128   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1129   MatScalar      *aa=a->a,*v;
1130   PetscScalar    *x,*b,*s,*t,*ls;
1131 
1132   PetscFunctionBegin;
1133   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135   t  = a->solve_work;
1136 
1137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1139 
1140   /* forward solve the lower triangular */
1141   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1142   for (i=1; i<n; i++) {
1143     v   = aa + bs2*ai[i];
1144     vi  = aj + ai[i];
1145     nz  = a->diag[i] - ai[i];
1146     s = t + bs*i;
1147     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     while (nz--) {
1149       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1150       v += bs2;
1151     }
1152   }
1153   /* backward solve the upper triangular */
1154   ls = a->solve_work + A->cmap->n;
1155   for (i=n-1; i>=0; i--){
1156     v   = aa + bs2*(a->diag[i] + 1);
1157     vi  = aj + a->diag[i] + 1;
1158     nz  = ai[i+1] - a->diag[i] - 1;
1159     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1160     while (nz--) {
1161       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1162       v += bs2;
1163     }
1164     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1165     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1166   }
1167 
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 /* ----------------------------------------------------------- */
1177 #undef __FUNCT__
1178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
1179 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1180 {
1181   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1182   IS                iscol=a->col,isrow=a->row;
1183   PetscErrorCode    ierr;
1184   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1185   PetscInt          i,n=a->mbs,j;
1186   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
1187   const MatScalar   *aa=a->a,*v;
1188   PetscScalar       *x,*t,*ls;
1189   const PetscScalar *b;
1190   PetscFunctionBegin;
1191   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1192   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1193   t    = a->solve_work;
1194 
1195   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1196   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1197 
1198   /* copy the b into temp work space according to permutation */
1199   for (i=0; i<n; i++) {
1200     for (j=0; j<bs; j++) {
1201       t[i*bs+j] = b[c[i]*bs+j];
1202     }
1203   }
1204 
1205 
1206   /* forward solve the upper triangular transpose */
1207   ls = a->solve_work + A->cmap->n;
1208   for (i=0; i<n; i++){
1209     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1210     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1211     v   = aa + bs2*(a->diag[i] + 1);
1212     vi  = aj + a->diag[i] + 1;
1213     nz  = ai[i+1] - a->diag[i] - 1;
1214     while (nz--) {
1215       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1216       v += bs2;
1217     }
1218   }
1219 
1220   /* backward solve the lower triangular transpose */
1221   for (i=n-1; i>=0; i--) {
1222     v   = aa + bs2*ai[i];
1223     vi  = aj + ai[i];
1224     nz  = a->diag[i] - ai[i];
1225     while (nz--) {
1226       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1227       v += bs2;
1228     }
1229   }
1230 
1231   /* copy t into x according to permutation */
1232   for (i=0; i<n; i++) {
1233     for (j=0; j<bs; j++) {
1234       x[bs*r[i]+j]   = t[bs*i+j];
1235     }
1236   }
1237 
1238   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1239   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1240   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1241   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1242   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 #undef __FUNCT__
1247 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1248 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1249 {
1250   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1251   IS             iscol=a->col,isrow=a->row;
1252   PetscErrorCode ierr;
1253   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1254   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1255   MatScalar      *aa=a->a,*v;
1256   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1257   PetscScalar    *x,*b,*t;
1258 
1259   PetscFunctionBegin;
1260   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1261   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1262   t  = a->solve_work;
1263 
1264   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1265   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1266 
1267   /* forward solve the lower triangular */
1268   idx    = 7*(*r++);
1269   t[0] = b[idx];   t[1] = b[1+idx];
1270   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1271   t[5] = b[5+idx]; t[6] = b[6+idx];
1272 
1273   for (i=1; i<n; i++) {
1274     v     = aa + 49*ai[i];
1275     vi    = aj + ai[i];
1276     nz    = diag[i] - ai[i];
1277     idx   = 7*(*r++);
1278     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1279     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1280     while (nz--) {
1281       idx   = 7*(*vi++);
1282       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1283       x4    = t[3+idx];x5 = t[4+idx];
1284       x6    = t[5+idx];x7 = t[6+idx];
1285       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1286       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1287       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1288       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1289       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1290       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1291       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1292       v += 49;
1293     }
1294     idx = 7*i;
1295     t[idx]   = s1;t[1+idx] = s2;
1296     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1297     t[5+idx] = s6;t[6+idx] = s7;
1298   }
1299   /* backward solve the upper triangular */
1300   for (i=n-1; i>=0; i--){
1301     v    = aa + 49*diag[i] + 49;
1302     vi   = aj + diag[i] + 1;
1303     nz   = ai[i+1] - diag[i] - 1;
1304     idt  = 7*i;
1305     s1 = t[idt];  s2 = t[1+idt];
1306     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1307     s6 = t[5+idt];s7 = t[6+idt];
1308     while (nz--) {
1309       idx   = 7*(*vi++);
1310       x1    = t[idx];   x2 = t[1+idx];
1311       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1312       x6    = t[5+idx]; x7 = t[6+idx];
1313       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1314       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1315       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1316       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1317       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1318       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1319       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1320       v += 49;
1321     }
1322     idc = 7*(*c--);
1323     v   = aa + 49*diag[i];
1324     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1325                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1326     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1327                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1328     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1329                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1330     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1331                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1332     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1333                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1334     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1335                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1336     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1337                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1338   }
1339 
1340   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1341   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1342   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1343   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1344   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1345   PetscFunctionReturn(0);
1346 }
1347 
1348 #undef __FUNCT__
1349 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1350 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1351 {
1352   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1353   IS             iscol=a->col,isrow=a->row;
1354   PetscErrorCode ierr;
1355   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
1356   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
1357   MatScalar      *aa=a->a,*v;
1358   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1359   PetscScalar    *x,*b,*t;
1360 
1361   PetscFunctionBegin;
1362   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1363   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1364   t  = a->solve_work;
1365 
1366   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1367   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1368 
1369   /* forward solve the lower triangular */
1370   idx    = 7*r[0];
1371   t[0] = b[idx];   t[1] = b[1+idx];
1372   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1373   t[5] = b[5+idx]; t[6] = b[6+idx];
1374 
1375   for (i=1; i<n; i++) {
1376     v     = aa + 49*ai[i];
1377     vi    = aj + ai[i];
1378     nz    = ai[i+1] - ai[i];
1379     idx   = 7*r[i];
1380     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1381     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1382     for(m=0;m<nz;m++){
1383       idx   = 7*vi[m];
1384       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1385       x4    = t[3+idx];x5 = t[4+idx];
1386       x6    = t[5+idx];x7 = t[6+idx];
1387       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1388       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1389       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1390       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1391       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1392       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1393       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1394       v += 49;
1395     }
1396     idx = 7*i;
1397     t[idx]   = s1;t[1+idx] = s2;
1398     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1399     t[5+idx] = s6;t[6+idx] = s7;
1400   }
1401   /* backward solve the upper triangular */
1402   for (i=n-1; i>=0; i--){
1403     k    = 2*n-i;
1404     v    = aa + 49*ai[k];
1405     vi   = aj + ai[k];
1406     nz   = ai[k+1] - ai[k] - 1;
1407     idt  = 7*i;
1408     s1 = t[idt];  s2 = t[1+idt];
1409     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1410     s6 = t[5+idt];s7 = t[6+idt];
1411     for(m=0;m<nz;m++){
1412       idx   = 7*vi[m];
1413       x1    = t[idx];   x2 = t[1+idx];
1414       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1415       x6    = t[5+idx]; x7 = t[6+idx];
1416       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1417       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1418       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1419       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1420       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1421       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1422       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1423       v += 49;
1424     }
1425     idc = 7*c[i];
1426     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1427                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1428     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1429                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1430     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1431                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1432     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1433                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1434     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1435                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1436     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1437                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1438     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1439                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1440   }
1441 
1442   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1443   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1444   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1445   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1446   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1447   PetscFunctionReturn(0);
1448 }
1449 
1450 #undef __FUNCT__
1451 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct_v2"
1452 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1453 {
1454   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1455   IS             iscol=a->col,isrow=a->row;
1456   PetscErrorCode ierr;
1457   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
1458   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
1459   MatScalar      *aa=a->a,*v;
1460   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1461   PetscScalar    *x,*b,*t;
1462 
1463   PetscFunctionBegin;
1464   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1465   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1466   t  = a->solve_work;
1467 
1468   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1469   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1470 
1471   /* forward solve the lower triangular */
1472   idx    = 7*r[0];
1473   t[0] = b[idx];   t[1] = b[1+idx];
1474   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1475   t[5] = b[5+idx]; t[6] = b[6+idx];
1476 
1477   for (i=1; i<n; i++) {
1478     v     = aa + 49*ai[i];
1479     vi    = aj + ai[i];
1480     nz    = ai[i+1] - ai[i];
1481     idx   = 7*r[i];
1482     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1483     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1484     for(m=0;m<nz;m++){
1485       idx   = 7*vi[m];
1486       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1487       x4    = t[3+idx];x5 = t[4+idx];
1488       x6    = t[5+idx];x7 = t[6+idx];
1489       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1490       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1491       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1492       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1493       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1494       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1495       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1496       v += 49;
1497     }
1498     idx = 7*i;
1499     t[idx]   = s1;t[1+idx] = s2;
1500     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1501     t[5+idx] = s6;t[6+idx] = s7;
1502   }
1503   /* backward solve the upper triangular */
1504   for (i=n-1; i>=0; i--){
1505     v    = aa + 49*(adiag[i+1]+1);
1506     vi   = aj + adiag[i+1]+1;
1507     nz   = adiag[i] - adiag[i+1] - 1;
1508     idt  = 7*i;
1509     s1 = t[idt];  s2 = t[1+idt];
1510     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1511     s6 = t[5+idt];s7 = t[6+idt];
1512     for(m=0;m<nz;m++){
1513       idx   = 7*vi[m];
1514       x1    = t[idx];   x2 = t[1+idx];
1515       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1516       x6    = t[5+idx]; x7 = t[6+idx];
1517       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1518       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1519       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1520       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1521       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1522       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1523       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1524       v += 49;
1525     }
1526     idc = 7*c[i];
1527     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1528                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1529     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1530                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1531     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1532                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1533     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1534                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1535     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1536                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1537     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1538                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1539     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1540                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1541   }
1542 
1543   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1544   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1545   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1546   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1547   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 #undef __FUNCT__
1552 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1553 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1554 {
1555   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1556   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1557   PetscErrorCode    ierr;
1558   PetscInt          *diag = a->diag,jdx;
1559   const MatScalar   *aa=a->a,*v;
1560   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1561   const PetscScalar *b;
1562 
1563   PetscFunctionBegin;
1564   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1565   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1566   /* forward solve the lower triangular */
1567   idx    = 0;
1568   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1569   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1570   x[6] = b[6+idx];
1571   for (i=1; i<n; i++) {
1572     v     =  aa + 49*ai[i];
1573     vi    =  aj + ai[i];
1574     nz    =  diag[i] - ai[i];
1575     idx   =  7*i;
1576     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1577     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1578     s7  =  b[6+idx];
1579     while (nz--) {
1580       jdx   = 7*(*vi++);
1581       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1582       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1583       x7    = x[6+jdx];
1584       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1585       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1586       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1587       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1588       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1589       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1590       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1591       v += 49;
1592      }
1593     x[idx]   = s1;
1594     x[1+idx] = s2;
1595     x[2+idx] = s3;
1596     x[3+idx] = s4;
1597     x[4+idx] = s5;
1598     x[5+idx] = s6;
1599     x[6+idx] = s7;
1600   }
1601   /* backward solve the upper triangular */
1602   for (i=n-1; i>=0; i--){
1603     v    = aa + 49*diag[i] + 49;
1604     vi   = aj + diag[i] + 1;
1605     nz   = ai[i+1] - diag[i] - 1;
1606     idt  = 7*i;
1607     s1 = x[idt];   s2 = x[1+idt];
1608     s3 = x[2+idt]; s4 = x[3+idt];
1609     s5 = x[4+idt]; s6 = x[5+idt];
1610     s7 = x[6+idt];
1611     while (nz--) {
1612       idx   = 7*(*vi++);
1613       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1614       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1615       x7    = x[6+idx];
1616       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1617       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1618       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1619       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1620       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1621       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1622       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1623       v += 49;
1624     }
1625     v        = aa + 49*diag[i];
1626     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1627                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1628     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1629                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1630     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1631                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1632     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1633                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1634     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1635                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1636     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1637                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1638     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1639                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1640   }
1641 
1642   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1643   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1644   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1645   PetscFunctionReturn(0);
1646 }
1647 
1648 #undef __FUNCT__
1649 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1650 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1651 {
1652     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1653     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1654     PetscErrorCode    ierr;
1655     PetscInt          idx,jdx,idt;
1656     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1657     const MatScalar   *aa=a->a,*v;
1658     PetscScalar       *x;
1659     const PetscScalar *b;
1660     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1661 
1662     PetscFunctionBegin;
1663     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1664     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1665     /* forward solve the lower triangular */
1666     idx    = 0;
1667     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1668     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1669     for (i=1; i<n; i++) {
1670        v    = aa + bs2*ai[i];
1671        vi   = aj + ai[i];
1672        nz   = ai[i+1] - ai[i];
1673       idx   = bs*i;
1674        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1675        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1676        for(k=0;k<nz;k++) {
1677           jdx   = bs*vi[k];
1678           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1679 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1680           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1681           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1682           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1683 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1684           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1685 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1686 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1687           v   +=  bs2;
1688         }
1689 
1690        x[idx]   = s1;
1691        x[1+idx] = s2;
1692        x[2+idx] = s3;
1693        x[3+idx] = s4;
1694        x[4+idx] = s5;
1695        x[5+idx] = s6;
1696        x[6+idx] = s7;
1697     }
1698 
1699    /* backward solve the upper triangular */
1700   for (i=n-1; i>=0; i--){
1701      v   = aa + bs2*ai[2*n-i];
1702      vi  = aj + ai[2*n-i];
1703      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1704      idt = bs*i;
1705      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1706      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1707     for(k=0;k<nz;k++) {
1708       idx   = bs*vi[k];
1709        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1710        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1711        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1712        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1713        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1714        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1715        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1716        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1717        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1718         v   +=  bs2;
1719     }
1720     /* x = inv_diagonal*x */
1721     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1722     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1723     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1724     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1725     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1726     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1727     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1728   }
1729 
1730   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1731   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1732   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 #undef __FUNCT__
1737 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
1738 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1739 {
1740     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1741     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
1742     PetscErrorCode    ierr;
1743     PetscInt          idx,jdx,idt;
1744     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1745     const MatScalar   *aa=a->a,*v;
1746     PetscScalar       *x;
1747     const PetscScalar *b;
1748     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1749 
1750     PetscFunctionBegin;
1751     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1752     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1753     /* forward solve the lower triangular */
1754     idx    = 0;
1755     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1756     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1757     for (i=1; i<n; i++) {
1758        v    = aa + bs2*ai[i];
1759        vi   = aj + ai[i];
1760        nz   = ai[i+1] - ai[i];
1761       idx   = bs*i;
1762        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1763        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1764        for(k=0;k<nz;k++) {
1765           jdx   = bs*vi[k];
1766           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1767 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1768           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1769           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1770           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1771 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1772           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1773 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1774 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1775           v   +=  bs2;
1776         }
1777 
1778        x[idx]   = s1;
1779        x[1+idx] = s2;
1780        x[2+idx] = s3;
1781        x[3+idx] = s4;
1782        x[4+idx] = s5;
1783        x[5+idx] = s6;
1784        x[6+idx] = s7;
1785     }
1786 
1787    /* backward solve the upper triangular */
1788   for (i=n-1; i>=0; i--){
1789     v   = aa + bs2*(adiag[i+1]+1);
1790      vi  = aj + adiag[i+1]+1;
1791      nz  = adiag[i] - adiag[i+1]-1;
1792      idt = bs*i;
1793      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1794      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1795     for(k=0;k<nz;k++) {
1796       idx   = bs*vi[k];
1797        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1798        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1799        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1800        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1801        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1802        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1803        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1804        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1805        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1806         v   +=  bs2;
1807     }
1808     /* x = inv_diagonal*x */
1809     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1810     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1811     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1812     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1813     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1814     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1815     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1816   }
1817 
1818   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1819   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1820   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1821   PetscFunctionReturn(0);
1822 }
1823 
1824 #undef __FUNCT__
1825 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1826 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1827 {
1828   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1829   IS                iscol=a->col,isrow=a->row;
1830   PetscErrorCode    ierr;
1831   const PetscInt    *r,*c,*rout,*cout;
1832   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1833   const MatScalar   *aa=a->a,*v;
1834   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1835   const PetscScalar *b;
1836   PetscFunctionBegin;
1837   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1838   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1839   t  = a->solve_work;
1840 
1841   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1842   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1843 
1844   /* forward solve the lower triangular */
1845   idx    = 6*(*r++);
1846   t[0] = b[idx];   t[1] = b[1+idx];
1847   t[2] = b[2+idx]; t[3] = b[3+idx];
1848   t[4] = b[4+idx]; t[5] = b[5+idx];
1849   for (i=1; i<n; i++) {
1850     v     = aa + 36*ai[i];
1851     vi    = aj + ai[i];
1852     nz    = diag[i] - ai[i];
1853     idx   = 6*(*r++);
1854     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1855     s5  = b[4+idx]; s6 = b[5+idx];
1856     while (nz--) {
1857       idx   = 6*(*vi++);
1858       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1859       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1860       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1861       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1862       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1863       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1864       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1865       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1866       v += 36;
1867     }
1868     idx = 6*i;
1869     t[idx]   = s1;t[1+idx] = s2;
1870     t[2+idx] = s3;t[3+idx] = s4;
1871     t[4+idx] = s5;t[5+idx] = s6;
1872   }
1873   /* backward solve the upper triangular */
1874   for (i=n-1; i>=0; i--){
1875     v    = aa + 36*diag[i] + 36;
1876     vi   = aj + diag[i] + 1;
1877     nz   = ai[i+1] - diag[i] - 1;
1878     idt  = 6*i;
1879     s1 = t[idt];  s2 = t[1+idt];
1880     s3 = t[2+idt];s4 = t[3+idt];
1881     s5 = t[4+idt];s6 = t[5+idt];
1882     while (nz--) {
1883       idx   = 6*(*vi++);
1884       x1    = t[idx];   x2 = t[1+idx];
1885       x3    = t[2+idx]; x4 = t[3+idx];
1886       x5    = t[4+idx]; x6 = t[5+idx];
1887       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1888       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1889       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1890       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1891       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1892       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1893       v += 36;
1894     }
1895     idc = 6*(*c--);
1896     v   = aa + 36*diag[i];
1897     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1898                                  v[18]*s4+v[24]*s5+v[30]*s6;
1899     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1900                                  v[19]*s4+v[25]*s5+v[31]*s6;
1901     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1902                                  v[20]*s4+v[26]*s5+v[32]*s6;
1903     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1904                                  v[21]*s4+v[27]*s5+v[33]*s6;
1905     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1906                                  v[22]*s4+v[28]*s5+v[34]*s6;
1907     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1908                                  v[23]*s4+v[29]*s5+v[35]*s6;
1909   }
1910 
1911   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1912   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1913   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1914   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1915   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1916   PetscFunctionReturn(0);
1917 }
1918 
1919 #undef __FUNCT__
1920 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
1921 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
1922 {
1923   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1924   IS                iscol=a->col,isrow=a->row;
1925   PetscErrorCode    ierr;
1926   const PetscInt    *r,*c,*rout,*cout;
1927   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
1928   const MatScalar   *aa=a->a,*v;
1929   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1930   const PetscScalar *b;
1931   PetscFunctionBegin;
1932   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1933   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1934   t  = a->solve_work;
1935 
1936   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1937   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1938 
1939   /* forward solve the lower triangular */
1940   idx    = 6*r[0];
1941   t[0] = b[idx];   t[1] = b[1+idx];
1942   t[2] = b[2+idx]; t[3] = b[3+idx];
1943   t[4] = b[4+idx]; t[5] = b[5+idx];
1944   for (i=1; i<n; i++) {
1945     v     = aa + 36*ai[i];
1946     vi    = aj + ai[i];
1947     nz    = ai[i+1] - ai[i];
1948     idx   = 6*r[i];
1949     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1950     s5  = b[4+idx]; s6 = b[5+idx];
1951     for(m=0;m<nz;m++){
1952       idx   = 6*vi[m];
1953       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1954       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1955       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1956       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1957       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1958       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1959       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1960       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1961       v += 36;
1962     }
1963     idx = 6*i;
1964     t[idx]   = s1;t[1+idx] = s2;
1965     t[2+idx] = s3;t[3+idx] = s4;
1966     t[4+idx] = s5;t[5+idx] = s6;
1967   }
1968   /* backward solve the upper triangular */
1969   for (i=n-1; i>=0; i--){
1970     k    = 2*n-i;
1971     v    = aa + 36*ai[k];
1972     vi   = aj + ai[k];
1973     nz   = ai[k+1] - ai[k] - 1;
1974     idt  = 6*i;
1975     s1 = t[idt];  s2 = t[1+idt];
1976     s3 = t[2+idt];s4 = t[3+idt];
1977     s5 = t[4+idt];s6 = t[5+idt];
1978     for(m=0;m<nz;m++){
1979       idx   = 6*vi[m];
1980       x1    = t[idx];   x2 = t[1+idx];
1981       x3    = t[2+idx]; x4 = t[3+idx];
1982       x5    = t[4+idx]; x6 = t[5+idx];
1983       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1984       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1985       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1986       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1987       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1988       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1989       v += 36;
1990     }
1991     idc = 6*c[i];
1992     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1993                                  v[18]*s4+v[24]*s5+v[30]*s6;
1994     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1995                                  v[19]*s4+v[25]*s5+v[31]*s6;
1996     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1997                                  v[20]*s4+v[26]*s5+v[32]*s6;
1998     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1999                                  v[21]*s4+v[27]*s5+v[33]*s6;
2000     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2001                                  v[22]*s4+v[28]*s5+v[34]*s6;
2002     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2003                                  v[23]*s4+v[29]*s5+v[35]*s6;
2004   }
2005 
2006   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2007   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2008   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2009   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2010   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2011   PetscFunctionReturn(0);
2012 }
2013 
2014 #undef __FUNCT__
2015 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2"
2016 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2017 {
2018   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2019   IS                iscol=a->col,isrow=a->row;
2020   PetscErrorCode    ierr;
2021   const PetscInt    *r,*c,*rout,*cout;
2022   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2023   const MatScalar   *aa=a->a,*v;
2024   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2025   const PetscScalar *b;
2026   PetscFunctionBegin;
2027   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2028   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2029   t  = a->solve_work;
2030 
2031   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2032   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2033 
2034   /* forward solve the lower triangular */
2035   idx    = 6*r[0];
2036   t[0] = b[idx];   t[1] = b[1+idx];
2037   t[2] = b[2+idx]; t[3] = b[3+idx];
2038   t[4] = b[4+idx]; t[5] = b[5+idx];
2039   for (i=1; i<n; i++) {
2040     v     = aa + 36*ai[i];
2041     vi    = aj + ai[i];
2042     nz    = ai[i+1] - ai[i];
2043     idx   = 6*r[i];
2044     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2045     s5  = b[4+idx]; s6 = b[5+idx];
2046     for(m=0;m<nz;m++){
2047       idx   = 6*vi[m];
2048       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2049       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2050       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2051       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2052       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2053       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2054       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2055       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2056       v += 36;
2057     }
2058     idx = 6*i;
2059     t[idx]   = s1;t[1+idx] = s2;
2060     t[2+idx] = s3;t[3+idx] = s4;
2061     t[4+idx] = s5;t[5+idx] = s6;
2062   }
2063   /* backward solve the upper triangular */
2064   for (i=n-1; i>=0; i--){
2065     v    = aa + 36*(adiag[i+1]+1);
2066     vi   = aj + adiag[i+1]+1;
2067     nz   = adiag[i] - adiag[i+1] - 1;
2068     idt  = 6*i;
2069     s1 = t[idt];  s2 = t[1+idt];
2070     s3 = t[2+idt];s4 = t[3+idt];
2071     s5 = t[4+idt];s6 = t[5+idt];
2072     for(m=0;m<nz;m++){
2073       idx   = 6*vi[m];
2074       x1    = t[idx];   x2 = t[1+idx];
2075       x3    = t[2+idx]; x4 = t[3+idx];
2076       x5    = t[4+idx]; x6 = t[5+idx];
2077       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2078       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2079       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2080       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2081       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2082       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2083       v += 36;
2084     }
2085     idc = 6*c[i];
2086     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2087                                  v[18]*s4+v[24]*s5+v[30]*s6;
2088     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2089                                  v[19]*s4+v[25]*s5+v[31]*s6;
2090     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2091                                  v[20]*s4+v[26]*s5+v[32]*s6;
2092     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2093                                  v[21]*s4+v[27]*s5+v[33]*s6;
2094     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2095                                  v[22]*s4+v[28]*s5+v[34]*s6;
2096     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2097                                  v[23]*s4+v[29]*s5+v[35]*s6;
2098   }
2099 
2100   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2101   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2102   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2103   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2104   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2105   PetscFunctionReturn(0);
2106 }
2107 
2108 #undef __FUNCT__
2109 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
2110 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
2111 {
2112   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2113   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2114   PetscErrorCode    ierr;
2115   PetscInt          *diag = a->diag,jdx;
2116   const MatScalar   *aa=a->a,*v;
2117   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2118   const PetscScalar *b;
2119 
2120   PetscFunctionBegin;
2121   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2122   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2123   /* forward solve the lower triangular */
2124   idx    = 0;
2125   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2126   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2127   for (i=1; i<n; i++) {
2128     v     =  aa + 36*ai[i];
2129     vi    =  aj + ai[i];
2130     nz    =  diag[i] - ai[i];
2131     idx   =  6*i;
2132     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2133     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2134     while (nz--) {
2135       jdx   = 6*(*vi++);
2136       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2137       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2138       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2139       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2140       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2141       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2142       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2143       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2144       v += 36;
2145      }
2146     x[idx]   = s1;
2147     x[1+idx] = s2;
2148     x[2+idx] = s3;
2149     x[3+idx] = s4;
2150     x[4+idx] = s5;
2151     x[5+idx] = s6;
2152   }
2153   /* backward solve the upper triangular */
2154   for (i=n-1; i>=0; i--){
2155     v    = aa + 36*diag[i] + 36;
2156     vi   = aj + diag[i] + 1;
2157     nz   = ai[i+1] - diag[i] - 1;
2158     idt  = 6*i;
2159     s1 = x[idt];   s2 = x[1+idt];
2160     s3 = x[2+idt]; s4 = x[3+idt];
2161     s5 = x[4+idt]; s6 = x[5+idt];
2162     while (nz--) {
2163       idx   = 6*(*vi++);
2164       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2165       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2166       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2167       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2168       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2169       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2170       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2171       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2172       v += 36;
2173     }
2174     v        = aa + 36*diag[i];
2175     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2176     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2177     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2178     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2179     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2180     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2181   }
2182 
2183   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2184   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2185   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2186   PetscFunctionReturn(0);
2187 }
2188 
2189 #undef __FUNCT__
2190 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2191 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2192 {
2193     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2194     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2195     PetscErrorCode    ierr;
2196     PetscInt          idx,jdx,idt;
2197     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2198     const MatScalar   *aa=a->a,*v;
2199     PetscScalar       *x;
2200     const PetscScalar *b;
2201     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2202 
2203     PetscFunctionBegin;
2204     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2205     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2206     /* forward solve the lower triangular */
2207     idx    = 0;
2208     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2209     x[4] = b[4+idx];x[5] = b[5+idx];
2210     for (i=1; i<n; i++) {
2211        v    = aa + bs2*ai[i];
2212        vi   = aj + ai[i];
2213        nz   = ai[i+1] - ai[i];
2214       idx   = bs*i;
2215        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2216        s5   = b[4+idx];s6 = b[5+idx];
2217        for(k=0;k<nz;k++){
2218           jdx   = bs*vi[k];
2219           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2220 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2221           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2222           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2223           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2224 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2225           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2226 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2227           v   +=  bs2;
2228         }
2229 
2230        x[idx]   = s1;
2231        x[1+idx] = s2;
2232        x[2+idx] = s3;
2233        x[3+idx] = s4;
2234        x[4+idx] = s5;
2235        x[5+idx] = s6;
2236     }
2237 
2238    /* backward solve the upper triangular */
2239   for (i=n-1; i>=0; i--){
2240      v   = aa + bs2*ai[2*n-i];
2241      vi  = aj + ai[2*n-i];
2242      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2243      idt = bs*i;
2244      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2245      s5 = x[4+idt];s6 = x[5+idt];
2246      for(k=0;k<nz;k++){
2247       idx   = bs*vi[k];
2248        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2249        x5    = x[4+idx];x6 = x[5+idx];
2250        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2251        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2252        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2253        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2254        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2255        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2256         v   +=  bs2;
2257     }
2258     /* x = inv_diagonal*x */
2259    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2260    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2261    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2262    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2263    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2264    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2265   }
2266 
2267   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2268   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2269   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2270   PetscFunctionReturn(0);
2271 }
2272 
2273 #undef __FUNCT__
2274 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2"
2275 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2276 {
2277     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2278     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2279     PetscErrorCode    ierr;
2280     PetscInt          idx,jdx,idt;
2281     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2282     const MatScalar   *aa=a->a,*v;
2283     PetscScalar       *x;
2284     const PetscScalar *b;
2285     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2286 
2287     PetscFunctionBegin;
2288     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2289     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2290     /* forward solve the lower triangular */
2291     idx    = 0;
2292     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2293     x[4] = b[4+idx];x[5] = b[5+idx];
2294     for (i=1; i<n; i++) {
2295        v    = aa + bs2*ai[i];
2296        vi   = aj + ai[i];
2297        nz   = ai[i+1] - ai[i];
2298       idx   = bs*i;
2299        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2300        s5   = b[4+idx];s6 = b[5+idx];
2301        for(k=0;k<nz;k++){
2302           jdx   = bs*vi[k];
2303           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2304 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2305           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2306           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2307           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2308 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2309           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2310 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2311           v   +=  bs2;
2312         }
2313 
2314        x[idx]   = s1;
2315        x[1+idx] = s2;
2316        x[2+idx] = s3;
2317        x[3+idx] = s4;
2318        x[4+idx] = s5;
2319        x[5+idx] = s6;
2320     }
2321 
2322    /* backward solve the upper triangular */
2323   for (i=n-1; i>=0; i--){
2324     v   = aa + bs2*(adiag[i+1]+1);
2325      vi  = aj + adiag[i+1]+1;
2326      nz  = adiag[i] - adiag[i+1]-1;
2327      idt = bs*i;
2328      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2329      s5 = x[4+idt];s6 = x[5+idt];
2330      for(k=0;k<nz;k++){
2331       idx   = bs*vi[k];
2332        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2333        x5    = x[4+idx];x6 = x[5+idx];
2334        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2335        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2336        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2337        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2338        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2339        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2340         v   +=  bs2;
2341     }
2342     /* x = inv_diagonal*x */
2343    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2344    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2345    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2346    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2347    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2348    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2349   }
2350 
2351   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2352   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2353   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2354   PetscFunctionReturn(0);
2355 }
2356 
2357 #undef __FUNCT__
2358 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2359 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
2360 {
2361   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2362   IS                iscol=a->col,isrow=a->row;
2363   PetscErrorCode    ierr;
2364   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
2365   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2366   const MatScalar   *aa=a->a,*v;
2367   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2368   const PetscScalar *b;
2369 
2370   PetscFunctionBegin;
2371   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2372   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2373   t  = a->solve_work;
2374 
2375   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2376   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2377 
2378   /* forward solve the lower triangular */
2379   idx    = 5*(*r++);
2380   t[0] = b[idx];   t[1] = b[1+idx];
2381   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2382   for (i=1; i<n; i++) {
2383     v     = aa + 25*ai[i];
2384     vi    = aj + ai[i];
2385     nz    = diag[i] - ai[i];
2386     idx   = 5*(*r++);
2387     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2388     s5  = b[4+idx];
2389     while (nz--) {
2390       idx   = 5*(*vi++);
2391       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2392       x4    = t[3+idx];x5 = t[4+idx];
2393       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2394       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2395       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2396       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2397       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2398       v += 25;
2399     }
2400     idx = 5*i;
2401     t[idx]   = s1;t[1+idx] = s2;
2402     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2403   }
2404   /* backward solve the upper triangular */
2405   for (i=n-1; i>=0; i--){
2406     v    = aa + 25*diag[i] + 25;
2407     vi   = aj + diag[i] + 1;
2408     nz   = ai[i+1] - diag[i] - 1;
2409     idt  = 5*i;
2410     s1 = t[idt];  s2 = t[1+idt];
2411     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2412     while (nz--) {
2413       idx   = 5*(*vi++);
2414       x1    = t[idx];   x2 = t[1+idx];
2415       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2416       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2417       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2418       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2419       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2420       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2421       v += 25;
2422     }
2423     idc = 5*(*c--);
2424     v   = aa + 25*diag[i];
2425     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2426                                  v[15]*s4+v[20]*s5;
2427     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2428                                  v[16]*s4+v[21]*s5;
2429     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2430                                  v[17]*s4+v[22]*s5;
2431     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2432                                  v[18]*s4+v[23]*s5;
2433     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2434                                  v[19]*s4+v[24]*s5;
2435   }
2436 
2437   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2438   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2439   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2440   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2441   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2442   PetscFunctionReturn(0);
2443 }
2444 
2445 #undef __FUNCT__
2446 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2447 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
2448 {
2449   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2450   IS                iscol=a->col,isrow=a->row;
2451   PetscErrorCode    ierr;
2452   const PetscInt    *r,*c,*rout,*cout;
2453   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2454   const MatScalar   *aa=a->a,*v;
2455   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2456   const PetscScalar *b;
2457 
2458   PetscFunctionBegin;
2459   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2460   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2461   t  = a->solve_work;
2462 
2463   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2464   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2465 
2466   /* forward solve the lower triangular */
2467   idx    = 5*r[0];
2468   t[0] = b[idx];   t[1] = b[1+idx];
2469   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2470   for (i=1; i<n; i++) {
2471     v     = aa + 25*ai[i];
2472     vi    = aj + ai[i];
2473     nz    = ai[i+1] - ai[i];
2474     idx   = 5*r[i];
2475     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2476     s5  = b[4+idx];
2477     for(m=0;m<nz;m++){
2478       idx   = 5*vi[m];
2479       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2480       x4    = t[3+idx];x5 = t[4+idx];
2481       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2482       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2483       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2484       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2485       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2486       v += 25;
2487     }
2488     idx = 5*i;
2489     t[idx]   = s1;t[1+idx] = s2;
2490     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2491   }
2492   /* backward solve the upper triangular */
2493   for (i=n-1; i>=0; i--){
2494     k    = 2*n-i;
2495     v    = aa + 25*ai[k];
2496     vi   = aj + ai[k];
2497     nz   = ai[k+1] - ai[k] - 1;
2498     idt  = 5*i;
2499     s1 = t[idt];  s2 = t[1+idt];
2500     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2501     for(m=0;m<nz;m++){
2502       idx   = 5*vi[m];
2503       x1    = t[idx];   x2 = t[1+idx];
2504       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2505       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2506       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2507       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2508       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2509       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2510       v += 25;
2511     }
2512     idc = 5*c[i];
2513     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2514                                  v[15]*s4+v[20]*s5;
2515     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2516                                  v[16]*s4+v[21]*s5;
2517     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2518                                  v[17]*s4+v[22]*s5;
2519     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2520                                  v[18]*s4+v[23]*s5;
2521     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2522                                  v[19]*s4+v[24]*s5;
2523   }
2524 
2525   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2526   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2527   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2528   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2529   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2530   PetscFunctionReturn(0);
2531 }
2532 
2533 #undef __FUNCT__
2534 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2"
2535 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2536 {
2537   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2538   IS                iscol=a->col,isrow=a->row;
2539   PetscErrorCode    ierr;
2540   const PetscInt    *r,*c,*rout,*cout;
2541   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2542   const MatScalar   *aa=a->a,*v;
2543   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2544   const PetscScalar *b;
2545 
2546   PetscFunctionBegin;
2547   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2548   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2549   t  = a->solve_work;
2550 
2551   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2552   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2553 
2554   /* forward solve the lower triangular */
2555   idx    = 5*r[0];
2556   t[0] = b[idx];   t[1] = b[1+idx];
2557   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2558   for (i=1; i<n; i++) {
2559     v     = aa + 25*ai[i];
2560     vi    = aj + ai[i];
2561     nz    = ai[i+1] - ai[i];
2562     idx   = 5*r[i];
2563     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2564     s5  = b[4+idx];
2565     for(m=0;m<nz;m++){
2566       idx   = 5*vi[m];
2567       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2568       x4    = t[3+idx];x5 = t[4+idx];
2569       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2570       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2571       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2572       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2573       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2574       v += 25;
2575     }
2576     idx = 5*i;
2577     t[idx]   = s1;t[1+idx] = s2;
2578     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2579   }
2580   /* backward solve the upper triangular */
2581   for (i=n-1; i>=0; i--){
2582     v    = aa + 25*(adiag[i+1]+1);
2583     vi   = aj + adiag[i+1]+1;
2584     nz   = adiag[i] - adiag[i+1] - 1;
2585     idt  = 5*i;
2586     s1 = t[idt];  s2 = t[1+idt];
2587     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2588     for(m=0;m<nz;m++){
2589       idx   = 5*vi[m];
2590       x1    = t[idx];   x2 = t[1+idx];
2591       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2592       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2593       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2594       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2595       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2596       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2597       v += 25;
2598     }
2599     idc = 5*c[i];
2600     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2601                                  v[15]*s4+v[20]*s5;
2602     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2603                                  v[16]*s4+v[21]*s5;
2604     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2605                                  v[17]*s4+v[22]*s5;
2606     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2607                                  v[18]*s4+v[23]*s5;
2608     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2609                                  v[19]*s4+v[24]*s5;
2610   }
2611 
2612   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2613   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2614   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2615   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2616   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2617   PetscFunctionReturn(0);
2618 }
2619 
2620 #undef __FUNCT__
2621 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2622 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
2623 {
2624   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2625   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2626   PetscErrorCode    ierr;
2627   PetscInt          *diag = a->diag,jdx;
2628   const MatScalar   *aa=a->a,*v;
2629   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2630   const PetscScalar *b;
2631 
2632   PetscFunctionBegin;
2633   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2634   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2635   /* forward solve the lower triangular */
2636   idx    = 0;
2637   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2638   for (i=1; i<n; i++) {
2639     v     =  aa + 25*ai[i];
2640     vi    =  aj + ai[i];
2641     nz    =  diag[i] - ai[i];
2642     idx   =  5*i;
2643     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2644     while (nz--) {
2645       jdx   = 5*(*vi++);
2646       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2647       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2648       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2649       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2650       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2651       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2652       v    += 25;
2653     }
2654     x[idx]   = s1;
2655     x[1+idx] = s2;
2656     x[2+idx] = s3;
2657     x[3+idx] = s4;
2658     x[4+idx] = s5;
2659   }
2660   /* backward solve the upper triangular */
2661   for (i=n-1; i>=0; i--){
2662     v    = aa + 25*diag[i] + 25;
2663     vi   = aj + diag[i] + 1;
2664     nz   = ai[i+1] - diag[i] - 1;
2665     idt  = 5*i;
2666     s1 = x[idt];  s2 = x[1+idt];
2667     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2668     while (nz--) {
2669       idx   = 5*(*vi++);
2670       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2671       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2672       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2673       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2674       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2675       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2676       v    += 25;
2677     }
2678     v        = aa + 25*diag[i];
2679     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2680     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2681     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2682     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2683     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2684   }
2685 
2686   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2688   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2689   PetscFunctionReturn(0);
2690 }
2691 
2692 #undef __FUNCT__
2693 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2694 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2695 {
2696   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2697   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2698   PetscErrorCode    ierr;
2699   PetscInt          jdx;
2700   const MatScalar   *aa=a->a,*v;
2701   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2702   const PetscScalar *b;
2703 
2704   PetscFunctionBegin;
2705   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2706   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2707   /* forward solve the lower triangular */
2708   idx    = 0;
2709   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2710   for (i=1; i<n; i++) {
2711     v   = aa + 25*ai[i];
2712     vi  = aj + ai[i];
2713     nz  = ai[i+1] - ai[i];
2714     idx = 5*i;
2715     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2716     for(k=0;k<nz;k++) {
2717       jdx   = 5*vi[k];
2718       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2719       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2720       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2721       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2722       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2723       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2724       v    += 25;
2725     }
2726     x[idx]   = s1;
2727     x[1+idx] = s2;
2728     x[2+idx] = s3;
2729     x[3+idx] = s4;
2730     x[4+idx] = s5;
2731   }
2732 
2733   /* backward solve the upper triangular */
2734   for (i=n-1; i>=0; i--){
2735     v   = aa + 25*ai[2*n-i];
2736     vi  = aj + ai[2*n-i];
2737     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2738     idt = 5*i;
2739     s1 = x[idt];  s2 = x[1+idt];
2740     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2741     for(k=0;k<nz;k++){
2742       idx   = 5*vi[k];
2743       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2744       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2745       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2746       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2747       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2748       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2749       v    += 25;
2750     }
2751     /* x = inv_diagonal*x */
2752     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2753     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2754     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2755     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2756     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2757   }
2758 
2759   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2760   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2761   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2762   PetscFunctionReturn(0);
2763 }
2764 
2765 #undef __FUNCT__
2766 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2"
2767 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2768 {
2769   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2770   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
2771   PetscErrorCode    ierr;
2772   PetscInt          jdx;
2773   const MatScalar   *aa=a->a,*v;
2774   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2775   const PetscScalar *b;
2776 
2777   PetscFunctionBegin;
2778   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2779   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2780   /* forward solve the lower triangular */
2781   idx    = 0;
2782   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2783   for (i=1; i<n; i++) {
2784     v   = aa + 25*ai[i];
2785     vi  = aj + ai[i];
2786     nz  = ai[i+1] - ai[i];
2787     idx = 5*i;
2788     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2789     for(k=0;k<nz;k++) {
2790       jdx   = 5*vi[k];
2791       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2792       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2793       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2794       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2795       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2796       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2797       v    += 25;
2798     }
2799     x[idx]   = s1;
2800     x[1+idx] = s2;
2801     x[2+idx] = s3;
2802     x[3+idx] = s4;
2803     x[4+idx] = s5;
2804   }
2805 
2806   /* backward solve the upper triangular */
2807   for (i=n-1; i>=0; i--){
2808     v   = aa + 25*(adiag[i+1]+1);
2809     vi  = aj + adiag[i+1]+1;
2810     nz  = adiag[i] - adiag[i+1]-1;
2811     idt = 5*i;
2812     s1 = x[idt];  s2 = x[1+idt];
2813     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2814     for(k=0;k<nz;k++){
2815       idx   = 5*vi[k];
2816       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2817       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2818       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2819       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2820       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2821       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2822       v    += 25;
2823     }
2824     /* x = inv_diagonal*x */
2825     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2826     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2827     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2828     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2829     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2830   }
2831 
2832   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2833   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2834   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2835   PetscFunctionReturn(0);
2836 }
2837 
2838 #undef __FUNCT__
2839 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2840 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2841 {
2842   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2843   IS                iscol=a->col,isrow=a->row;
2844   PetscErrorCode    ierr;
2845   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2846   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2847   const MatScalar   *aa=a->a,*v;
2848   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2849   const PetscScalar *b;
2850 
2851   PetscFunctionBegin;
2852   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2853   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2854   t  = a->solve_work;
2855 
2856   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2857   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2858 
2859   /* forward solve the lower triangular */
2860   idx    = 4*(*r++);
2861   t[0] = b[idx];   t[1] = b[1+idx];
2862   t[2] = b[2+idx]; t[3] = b[3+idx];
2863   for (i=1; i<n; i++) {
2864     v     = aa + 16*ai[i];
2865     vi    = aj + ai[i];
2866     nz    = diag[i] - ai[i];
2867     idx   = 4*(*r++);
2868     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2869     while (nz--) {
2870       idx   = 4*(*vi++);
2871       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2872       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2873       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2874       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2875       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2876       v    += 16;
2877     }
2878     idx        = 4*i;
2879     t[idx]   = s1;t[1+idx] = s2;
2880     t[2+idx] = s3;t[3+idx] = s4;
2881   }
2882   /* backward solve the upper triangular */
2883   for (i=n-1; i>=0; i--){
2884     v    = aa + 16*diag[i] + 16;
2885     vi   = aj + diag[i] + 1;
2886     nz   = ai[i+1] - diag[i] - 1;
2887     idt  = 4*i;
2888     s1 = t[idt];  s2 = t[1+idt];
2889     s3 = t[2+idt];s4 = t[3+idt];
2890     while (nz--) {
2891       idx   = 4*(*vi++);
2892       x1    = t[idx];   x2 = t[1+idx];
2893       x3    = t[2+idx]; x4 = t[3+idx];
2894       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2895       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2896       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2897       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2898       v += 16;
2899     }
2900     idc      = 4*(*c--);
2901     v        = aa + 16*diag[i];
2902     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2903     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2904     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2905     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2906   }
2907 
2908   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2909   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2910   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2911   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2912   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2913   PetscFunctionReturn(0);
2914 }
2915 
2916 #undef __FUNCT__
2917 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2918 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
2919 {
2920   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2921   IS                iscol=a->col,isrow=a->row;
2922   PetscErrorCode    ierr;
2923   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2924   const PetscInt    *r,*c,*rout,*cout;
2925   const MatScalar   *aa=a->a,*v;
2926   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2927   const PetscScalar *b;
2928 
2929   PetscFunctionBegin;
2930   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2931   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2932   t  = a->solve_work;
2933 
2934   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2935   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2936 
2937   /* forward solve the lower triangular */
2938   idx    = 4*r[0];
2939   t[0] = b[idx];   t[1] = b[1+idx];
2940   t[2] = b[2+idx]; t[3] = b[3+idx];
2941   for (i=1; i<n; i++) {
2942     v     = aa + 16*ai[i];
2943     vi    = aj + ai[i];
2944     nz    = ai[i+1] - ai[i];
2945     idx   = 4*r[i];
2946     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2947     for(m=0;m<nz;m++){
2948       idx   = 4*vi[m];
2949       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2950       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2951       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2952       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2953       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2954       v    += 16;
2955     }
2956     idx        = 4*i;
2957     t[idx]   = s1;t[1+idx] = s2;
2958     t[2+idx] = s3;t[3+idx] = s4;
2959   }
2960   /* backward solve the upper triangular */
2961   for (i=n-1; i>=0; i--){
2962     k    = 2*n-i;
2963     v    = aa + 16*ai[k];
2964     vi   = aj + ai[k];
2965     nz   = ai[k+1] - ai[k] - 1;
2966     idt  = 4*i;
2967     s1 = t[idt];  s2 = t[1+idt];
2968     s3 = t[2+idt];s4 = t[3+idt];
2969     for(m=0;m<nz;m++){
2970       idx   = 4*vi[m];
2971       x1    = t[idx];   x2 = t[1+idx];
2972       x3    = t[2+idx]; x4 = t[3+idx];
2973       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2974       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2975       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2976       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2977       v += 16;
2978     }
2979     idc      = 4*c[i];
2980     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2981     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2982     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2983     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2984   }
2985 
2986   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2987   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2988   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2989   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2990   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2991   PetscFunctionReturn(0);
2992 }
2993 
2994 #undef __FUNCT__
2995 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2"
2996 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2997 {
2998   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2999   IS                iscol=a->col,isrow=a->row;
3000   PetscErrorCode    ierr;
3001   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3002   const PetscInt    *r,*c,*rout,*cout;
3003   const MatScalar   *aa=a->a,*v;
3004   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3005   const PetscScalar *b;
3006 
3007   PetscFunctionBegin;
3008   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3009   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3010   t  = a->solve_work;
3011 
3012   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3013   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3014 
3015   /* forward solve the lower triangular */
3016   idx    = 4*r[0];
3017   t[0] = b[idx];   t[1] = b[1+idx];
3018   t[2] = b[2+idx]; t[3] = b[3+idx];
3019   for (i=1; i<n; i++) {
3020     v     = aa + 16*ai[i];
3021     vi    = aj + ai[i];
3022     nz    = ai[i+1] - ai[i];
3023     idx   = 4*r[i];
3024     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3025     for(m=0;m<nz;m++){
3026       idx   = 4*vi[m];
3027       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3028       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3029       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3030       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3031       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3032       v    += 16;
3033     }
3034     idx        = 4*i;
3035     t[idx]   = s1;t[1+idx] = s2;
3036     t[2+idx] = s3;t[3+idx] = s4;
3037   }
3038   /* backward solve the upper triangular */
3039   for (i=n-1; i>=0; i--){
3040     v    = aa + 16*(adiag[i+1]+1);
3041     vi   = aj + adiag[i+1]+1;
3042     nz   = adiag[i] - adiag[i+1] - 1;
3043     idt  = 4*i;
3044     s1 = t[idt];  s2 = t[1+idt];
3045     s3 = t[2+idt];s4 = t[3+idt];
3046     for(m=0;m<nz;m++){
3047       idx   = 4*vi[m];
3048       x1    = t[idx];   x2 = t[1+idx];
3049       x3    = t[2+idx]; x4 = t[3+idx];
3050       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3051       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3052       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3053       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3054       v += 16;
3055     }
3056     idc      = 4*c[i];
3057     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3058     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3059     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3060     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3061   }
3062 
3063   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3064   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3065   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3066   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3067   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3068   PetscFunctionReturn(0);
3069 }
3070 
3071 #undef __FUNCT__
3072 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3073 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3074 {
3075   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3076   IS                iscol=a->col,isrow=a->row;
3077   PetscErrorCode    ierr;
3078   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3079   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3080   const MatScalar   *aa=a->a,*v;
3081   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3082   PetscScalar       *x;
3083   const PetscScalar *b;
3084 
3085   PetscFunctionBegin;
3086   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3087   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3088   t  = (MatScalar *)a->solve_work;
3089 
3090   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3091   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3092 
3093   /* forward solve the lower triangular */
3094   idx    = 4*(*r++);
3095   t[0] = (MatScalar)b[idx];
3096   t[1] = (MatScalar)b[1+idx];
3097   t[2] = (MatScalar)b[2+idx];
3098   t[3] = (MatScalar)b[3+idx];
3099   for (i=1; i<n; i++) {
3100     v     = aa + 16*ai[i];
3101     vi    = aj + ai[i];
3102     nz    = diag[i] - ai[i];
3103     idx   = 4*(*r++);
3104     s1 = (MatScalar)b[idx];
3105     s2 = (MatScalar)b[1+idx];
3106     s3 = (MatScalar)b[2+idx];
3107     s4 = (MatScalar)b[3+idx];
3108     while (nz--) {
3109       idx   = 4*(*vi++);
3110       x1  = t[idx];
3111       x2  = t[1+idx];
3112       x3  = t[2+idx];
3113       x4  = t[3+idx];
3114       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3115       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3116       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3117       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3118       v    += 16;
3119     }
3120     idx        = 4*i;
3121     t[idx]   = s1;
3122     t[1+idx] = s2;
3123     t[2+idx] = s3;
3124     t[3+idx] = s4;
3125   }
3126   /* backward solve the upper triangular */
3127   for (i=n-1; i>=0; i--){
3128     v    = aa + 16*diag[i] + 16;
3129     vi   = aj + diag[i] + 1;
3130     nz   = ai[i+1] - diag[i] - 1;
3131     idt  = 4*i;
3132     s1 = t[idt];
3133     s2 = t[1+idt];
3134     s3 = t[2+idt];
3135     s4 = t[3+idt];
3136     while (nz--) {
3137       idx   = 4*(*vi++);
3138       x1  = t[idx];
3139       x2  = t[1+idx];
3140       x3  = t[2+idx];
3141       x4  = t[3+idx];
3142       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3143       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3144       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3145       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3146       v += 16;
3147     }
3148     idc      = 4*(*c--);
3149     v        = aa + 16*diag[i];
3150     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3151     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3152     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3153     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3154     x[idc]   = (PetscScalar)t[idt];
3155     x[1+idc] = (PetscScalar)t[1+idt];
3156     x[2+idc] = (PetscScalar)t[2+idt];
3157     x[3+idc] = (PetscScalar)t[3+idt];
3158  }
3159 
3160   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3161   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3162   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3163   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3164   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3165   PetscFunctionReturn(0);
3166 }
3167 
3168 #if defined (PETSC_HAVE_SSE)
3169 
3170 #include PETSC_HAVE_SSE
3171 
3172 #undef __FUNCT__
3173 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3174 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3175 {
3176   /*
3177      Note: This code uses demotion of double
3178      to float when performing the mixed-mode computation.
3179      This may not be numerically reasonable for all applications.
3180   */
3181   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3182   IS             iscol=a->col,isrow=a->row;
3183   PetscErrorCode ierr;
3184   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3185   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3186   MatScalar      *aa=a->a,*v;
3187   PetscScalar    *x,*b,*t;
3188 
3189   /* Make space in temp stack for 16 Byte Aligned arrays */
3190   float           ssealignedspace[11],*tmps,*tmpx;
3191   unsigned long   offset;
3192 
3193   PetscFunctionBegin;
3194   SSE_SCOPE_BEGIN;
3195 
3196     offset = (unsigned long)ssealignedspace % 16;
3197     if (offset) offset = (16 - offset)/4;
3198     tmps = &ssealignedspace[offset];
3199     tmpx = &ssealignedspace[offset+4];
3200     PREFETCH_NTA(aa+16*ai[1]);
3201 
3202     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3203     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3204     t  = a->solve_work;
3205 
3206     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3207     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3208 
3209     /* forward solve the lower triangular */
3210     idx  = 4*(*r++);
3211     t[0] = b[idx];   t[1] = b[1+idx];
3212     t[2] = b[2+idx]; t[3] = b[3+idx];
3213     v    =  aa + 16*ai[1];
3214 
3215     for (i=1; i<n;) {
3216       PREFETCH_NTA(&v[8]);
3217       vi   =  aj      + ai[i];
3218       nz   =  diag[i] - ai[i];
3219       idx  =  4*(*r++);
3220 
3221       /* Demote sum from double to float */
3222       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3223       LOAD_PS(tmps,XMM7);
3224 
3225       while (nz--) {
3226         PREFETCH_NTA(&v[16]);
3227         idx = 4*(*vi++);
3228 
3229         /* Demote solution (so far) from double to float */
3230         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3231 
3232         /* 4x4 Matrix-Vector product with negative accumulation: */
3233         SSE_INLINE_BEGIN_2(tmpx,v)
3234           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3235 
3236           /* First Column */
3237           SSE_COPY_PS(XMM0,XMM6)
3238           SSE_SHUFFLE(XMM0,XMM0,0x00)
3239           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3240           SSE_SUB_PS(XMM7,XMM0)
3241 
3242           /* Second Column */
3243           SSE_COPY_PS(XMM1,XMM6)
3244           SSE_SHUFFLE(XMM1,XMM1,0x55)
3245           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3246           SSE_SUB_PS(XMM7,XMM1)
3247 
3248           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3249 
3250           /* Third Column */
3251           SSE_COPY_PS(XMM2,XMM6)
3252           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3253           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3254           SSE_SUB_PS(XMM7,XMM2)
3255 
3256           /* Fourth Column */
3257           SSE_COPY_PS(XMM3,XMM6)
3258           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3259           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3260           SSE_SUB_PS(XMM7,XMM3)
3261         SSE_INLINE_END_2
3262 
3263         v  += 16;
3264       }
3265       idx = 4*i;
3266       v   = aa + 16*ai[++i];
3267       PREFETCH_NTA(v);
3268       STORE_PS(tmps,XMM7);
3269 
3270       /* Promote result from float to double */
3271       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3272     }
3273     /* backward solve the upper triangular */
3274     idt  = 4*(n-1);
3275     ai16 = 16*diag[n-1];
3276     v    = aa + ai16 + 16;
3277     for (i=n-1; i>=0;){
3278       PREFETCH_NTA(&v[8]);
3279       vi = aj + diag[i] + 1;
3280       nz = ai[i+1] - diag[i] - 1;
3281 
3282       /* Demote accumulator from double to float */
3283       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3284       LOAD_PS(tmps,XMM7);
3285 
3286       while (nz--) {
3287         PREFETCH_NTA(&v[16]);
3288         idx = 4*(*vi++);
3289 
3290         /* Demote solution (so far) from double to float */
3291         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3292 
3293         /* 4x4 Matrix-Vector Product with negative accumulation: */
3294         SSE_INLINE_BEGIN_2(tmpx,v)
3295           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3296 
3297           /* First Column */
3298           SSE_COPY_PS(XMM0,XMM6)
3299           SSE_SHUFFLE(XMM0,XMM0,0x00)
3300           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3301           SSE_SUB_PS(XMM7,XMM0)
3302 
3303           /* Second Column */
3304           SSE_COPY_PS(XMM1,XMM6)
3305           SSE_SHUFFLE(XMM1,XMM1,0x55)
3306           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3307           SSE_SUB_PS(XMM7,XMM1)
3308 
3309           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3310 
3311           /* Third Column */
3312           SSE_COPY_PS(XMM2,XMM6)
3313           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3314           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3315           SSE_SUB_PS(XMM7,XMM2)
3316 
3317           /* Fourth Column */
3318           SSE_COPY_PS(XMM3,XMM6)
3319           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3320           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3321           SSE_SUB_PS(XMM7,XMM3)
3322         SSE_INLINE_END_2
3323         v  += 16;
3324       }
3325       v    = aa + ai16;
3326       ai16 = 16*diag[--i];
3327       PREFETCH_NTA(aa+ai16+16);
3328       /*
3329          Scale the result by the diagonal 4x4 block,
3330          which was inverted as part of the factorization
3331       */
3332       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3333         /* First Column */
3334         SSE_COPY_PS(XMM0,XMM7)
3335         SSE_SHUFFLE(XMM0,XMM0,0x00)
3336         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3337 
3338         /* Second Column */
3339         SSE_COPY_PS(XMM1,XMM7)
3340         SSE_SHUFFLE(XMM1,XMM1,0x55)
3341         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3342         SSE_ADD_PS(XMM0,XMM1)
3343 
3344         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3345 
3346         /* Third Column */
3347         SSE_COPY_PS(XMM2,XMM7)
3348         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3349         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3350         SSE_ADD_PS(XMM0,XMM2)
3351 
3352         /* Fourth Column */
3353         SSE_COPY_PS(XMM3,XMM7)
3354         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3355         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3356         SSE_ADD_PS(XMM0,XMM3)
3357 
3358         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3359       SSE_INLINE_END_3
3360 
3361       /* Promote solution from float to double */
3362       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3363 
3364       /* Apply reordering to t and stream into x.    */
3365       /* This way, x doesn't pollute the cache.      */
3366       /* Be careful with size: 2 doubles = 4 floats! */
3367       idc  = 4*(*c--);
3368       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3369         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3370         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3371         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3372         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3373         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3374         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3375       SSE_INLINE_END_2
3376       v    = aa + ai16 + 16;
3377       idt -= 4;
3378     }
3379 
3380     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3381     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3382     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3383     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3384     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3385   SSE_SCOPE_END;
3386   PetscFunctionReturn(0);
3387 }
3388 
3389 #endif
3390 
3391 
3392 /*
3393       Special case where the matrix was ILU(0) factored in the natural
3394    ordering. This eliminates the need for the column and row permutation.
3395 */
3396 #undef __FUNCT__
3397 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3398 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3399 {
3400   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3401   PetscInt          n=a->mbs;
3402   const PetscInt    *ai=a->i,*aj=a->j;
3403   PetscErrorCode    ierr;
3404   const PetscInt    *diag = a->diag;
3405   const MatScalar   *aa=a->a;
3406   PetscScalar       *x;
3407   const PetscScalar *b;
3408 
3409   PetscFunctionBegin;
3410   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3411   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3412 
3413 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3414   {
3415     static PetscScalar w[2000]; /* very BAD need to fix */
3416     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3417   }
3418 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3419   {
3420     static PetscScalar w[2000]; /* very BAD need to fix */
3421     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3422   }
3423 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3424   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3425 #else
3426   {
3427     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3428     const MatScalar *v;
3429     PetscInt        jdx,idt,idx,nz,i,ai16;
3430     const PetscInt  *vi;
3431 
3432   /* forward solve the lower triangular */
3433   idx    = 0;
3434   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3435   for (i=1; i<n; i++) {
3436     v     =  aa      + 16*ai[i];
3437     vi    =  aj      + ai[i];
3438     nz    =  diag[i] - ai[i];
3439     idx   +=  4;
3440     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3441     while (nz--) {
3442       jdx   = 4*(*vi++);
3443       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3444       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3445       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3446       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3447       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3448       v    += 16;
3449     }
3450     x[idx]   = s1;
3451     x[1+idx] = s2;
3452     x[2+idx] = s3;
3453     x[3+idx] = s4;
3454   }
3455   /* backward solve the upper triangular */
3456   idt = 4*(n-1);
3457   for (i=n-1; i>=0; i--){
3458     ai16 = 16*diag[i];
3459     v    = aa + ai16 + 16;
3460     vi   = aj + diag[i] + 1;
3461     nz   = ai[i+1] - diag[i] - 1;
3462     s1 = x[idt];  s2 = x[1+idt];
3463     s3 = x[2+idt];s4 = x[3+idt];
3464     while (nz--) {
3465       idx   = 4*(*vi++);
3466       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3467       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3468       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3469       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3470       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3471       v    += 16;
3472     }
3473     v        = aa + ai16;
3474     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3475     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3476     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3477     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3478     idt -= 4;
3479   }
3480   }
3481 #endif
3482 
3483   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3484   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3485   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3486   PetscFunctionReturn(0);
3487 }
3488 
3489 #undef __FUNCT__
3490 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3491 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3492 {
3493     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3494     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3495     PetscErrorCode    ierr;
3496     PetscInt          idx,jdx,idt;
3497     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3498     const MatScalar   *aa=a->a,*v;
3499     PetscScalar       *x;
3500     const PetscScalar *b;
3501     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3502 
3503     PetscFunctionBegin;
3504     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3505     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3506     /* forward solve the lower triangular */
3507     idx    = 0;
3508     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3509     for (i=1; i<n; i++) {
3510        v    = aa + bs2*ai[i];
3511        vi   = aj + ai[i];
3512        nz   = ai[i+1] - ai[i];
3513       idx   = bs*i;
3514        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3515       for(k=0;k<nz;k++) {
3516           jdx   = bs*vi[k];
3517           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3518           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3519           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3520           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3521 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3522 
3523           v   +=  bs2;
3524         }
3525 
3526        x[idx]   = s1;
3527        x[1+idx] = s2;
3528        x[2+idx] = s3;
3529        x[3+idx] = s4;
3530     }
3531 
3532    /* backward solve the upper triangular */
3533   for (i=n-1; i>=0; i--){
3534      v   = aa + bs2*ai[2*n-i];
3535      vi  = aj + ai[2*n-i];
3536      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3537      idt = bs*i;
3538      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3539 
3540     for(k=0;k<nz;k++){
3541       idx   = bs*vi[k];
3542        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3543        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3544        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3545        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3546        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3547 
3548         v   +=  bs2;
3549     }
3550     /* x = inv_diagonal*x */
3551    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3552    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3553    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3554    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3555 
3556   }
3557 
3558   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3559   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3560   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3561   PetscFunctionReturn(0);
3562 }
3563 
3564 #undef __FUNCT__
3565 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
3566 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3567 {
3568     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3569     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3570     PetscErrorCode    ierr;
3571     PetscInt          idx,jdx,idt;
3572     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3573     const MatScalar   *aa=a->a,*v;
3574     PetscScalar       *x;
3575     const PetscScalar *b;
3576     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3577 
3578     PetscFunctionBegin;
3579     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3580     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3581     /* forward solve the lower triangular */
3582     idx    = 0;
3583     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3584     for (i=1; i<n; i++) {
3585        v    = aa + bs2*ai[i];
3586        vi   = aj + ai[i];
3587        nz   = ai[i+1] - ai[i];
3588       idx   = bs*i;
3589        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3590       for(k=0;k<nz;k++) {
3591           jdx   = bs*vi[k];
3592           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3593           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3594           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3595           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3596 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3597 
3598           v   +=  bs2;
3599         }
3600 
3601        x[idx]   = s1;
3602        x[1+idx] = s2;
3603        x[2+idx] = s3;
3604        x[3+idx] = s4;
3605     }
3606 
3607    /* backward solve the upper triangular */
3608   for (i=n-1; i>=0; i--){
3609     v   = aa + bs2*(adiag[i+1]+1);
3610      vi  = aj + adiag[i+1]+1;
3611      nz  = adiag[i] - adiag[i+1]-1;
3612      idt = bs*i;
3613      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3614 
3615     for(k=0;k<nz;k++){
3616       idx   = bs*vi[k];
3617        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3618        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3619        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3620        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3621        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3622 
3623         v   +=  bs2;
3624     }
3625     /* x = inv_diagonal*x */
3626    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3627    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3628    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3629    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3630 
3631   }
3632 
3633   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3634   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3635   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3636   PetscFunctionReturn(0);
3637 }
3638 
3639 #undef __FUNCT__
3640 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3641 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3642 {
3643   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3644   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3645   PetscErrorCode ierr;
3646   PetscInt       *diag = a->diag;
3647   MatScalar      *aa=a->a;
3648   PetscScalar    *x,*b;
3649 
3650   PetscFunctionBegin;
3651   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3652   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3653 
3654   {
3655     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3656     MatScalar  *v,*t=(MatScalar *)x;
3657     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3658 
3659     /* forward solve the lower triangular */
3660     idx  = 0;
3661     t[0] = (MatScalar)b[0];
3662     t[1] = (MatScalar)b[1];
3663     t[2] = (MatScalar)b[2];
3664     t[3] = (MatScalar)b[3];
3665     for (i=1; i<n; i++) {
3666       v     =  aa      + 16*ai[i];
3667       vi    =  aj      + ai[i];
3668       nz    =  diag[i] - ai[i];
3669       idx   +=  4;
3670       s1 = (MatScalar)b[idx];
3671       s2 = (MatScalar)b[1+idx];
3672       s3 = (MatScalar)b[2+idx];
3673       s4 = (MatScalar)b[3+idx];
3674       while (nz--) {
3675         jdx = 4*(*vi++);
3676         x1  = t[jdx];
3677         x2  = t[1+jdx];
3678         x3  = t[2+jdx];
3679         x4  = t[3+jdx];
3680         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3681         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3682         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3683         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3684         v    += 16;
3685       }
3686       t[idx]   = s1;
3687       t[1+idx] = s2;
3688       t[2+idx] = s3;
3689       t[3+idx] = s4;
3690     }
3691     /* backward solve the upper triangular */
3692     idt = 4*(n-1);
3693     for (i=n-1; i>=0; i--){
3694       ai16 = 16*diag[i];
3695       v    = aa + ai16 + 16;
3696       vi   = aj + diag[i] + 1;
3697       nz   = ai[i+1] - diag[i] - 1;
3698       s1   = t[idt];
3699       s2   = t[1+idt];
3700       s3   = t[2+idt];
3701       s4   = t[3+idt];
3702       while (nz--) {
3703         idx = 4*(*vi++);
3704         x1  = (MatScalar)x[idx];
3705         x2  = (MatScalar)x[1+idx];
3706         x3  = (MatScalar)x[2+idx];
3707         x4  = (MatScalar)x[3+idx];
3708         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3709         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3710         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3711         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3712         v    += 16;
3713       }
3714       v        = aa + ai16;
3715       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3716       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3717       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3718       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3719       idt -= 4;
3720     }
3721   }
3722 
3723   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3724   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3725   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3726   PetscFunctionReturn(0);
3727 }
3728 
3729 #if defined (PETSC_HAVE_SSE)
3730 
3731 #include PETSC_HAVE_SSE
3732 #undef __FUNCT__
3733 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3734 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
3735 {
3736   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3737   unsigned short *aj=(unsigned short *)a->j;
3738   PetscErrorCode ierr;
3739   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3740   MatScalar      *aa=a->a;
3741   PetscScalar    *x,*b;
3742 
3743   PetscFunctionBegin;
3744   SSE_SCOPE_BEGIN;
3745   /*
3746      Note: This code currently uses demotion of double
3747      to float when performing the mixed-mode computation.
3748      This may not be numerically reasonable for all applications.
3749   */
3750   PREFETCH_NTA(aa+16*ai[1]);
3751 
3752   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3753   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3754   {
3755     /* x will first be computed in single precision then promoted inplace to double */
3756     MatScalar      *v,*t=(MatScalar *)x;
3757     int            nz,i,idt,ai16;
3758     unsigned int   jdx,idx;
3759     unsigned short *vi;
3760     /* Forward solve the lower triangular factor. */
3761 
3762     /* First block is the identity. */
3763     idx  = 0;
3764     CONVERT_DOUBLE4_FLOAT4(t,b);
3765     v    =  aa + 16*((unsigned int)ai[1]);
3766 
3767     for (i=1; i<n;) {
3768       PREFETCH_NTA(&v[8]);
3769       vi   =  aj      + ai[i];
3770       nz   =  diag[i] - ai[i];
3771       idx +=  4;
3772 
3773       /* Demote RHS from double to float. */
3774       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3775       LOAD_PS(&t[idx],XMM7);
3776 
3777       while (nz--) {
3778         PREFETCH_NTA(&v[16]);
3779         jdx = 4*((unsigned int)(*vi++));
3780 
3781         /* 4x4 Matrix-Vector product with negative accumulation: */
3782         SSE_INLINE_BEGIN_2(&t[jdx],v)
3783           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3784 
3785           /* First Column */
3786           SSE_COPY_PS(XMM0,XMM6)
3787           SSE_SHUFFLE(XMM0,XMM0,0x00)
3788           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3789           SSE_SUB_PS(XMM7,XMM0)
3790 
3791           /* Second Column */
3792           SSE_COPY_PS(XMM1,XMM6)
3793           SSE_SHUFFLE(XMM1,XMM1,0x55)
3794           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3795           SSE_SUB_PS(XMM7,XMM1)
3796 
3797           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3798 
3799           /* Third Column */
3800           SSE_COPY_PS(XMM2,XMM6)
3801           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3802           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3803           SSE_SUB_PS(XMM7,XMM2)
3804 
3805           /* Fourth Column */
3806           SSE_COPY_PS(XMM3,XMM6)
3807           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3808           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3809           SSE_SUB_PS(XMM7,XMM3)
3810         SSE_INLINE_END_2
3811 
3812         v  += 16;
3813       }
3814       v    =  aa + 16*ai[++i];
3815       PREFETCH_NTA(v);
3816       STORE_PS(&t[idx],XMM7);
3817     }
3818 
3819     /* Backward solve the upper triangular factor.*/
3820 
3821     idt  = 4*(n-1);
3822     ai16 = 16*diag[n-1];
3823     v    = aa + ai16 + 16;
3824     for (i=n-1; i>=0;){
3825       PREFETCH_NTA(&v[8]);
3826       vi = aj + diag[i] + 1;
3827       nz = ai[i+1] - diag[i] - 1;
3828 
3829       LOAD_PS(&t[idt],XMM7);
3830 
3831       while (nz--) {
3832         PREFETCH_NTA(&v[16]);
3833         idx = 4*((unsigned int)(*vi++));
3834 
3835         /* 4x4 Matrix-Vector Product with negative accumulation: */
3836         SSE_INLINE_BEGIN_2(&t[idx],v)
3837           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3838 
3839           /* First Column */
3840           SSE_COPY_PS(XMM0,XMM6)
3841           SSE_SHUFFLE(XMM0,XMM0,0x00)
3842           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3843           SSE_SUB_PS(XMM7,XMM0)
3844 
3845           /* Second Column */
3846           SSE_COPY_PS(XMM1,XMM6)
3847           SSE_SHUFFLE(XMM1,XMM1,0x55)
3848           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3849           SSE_SUB_PS(XMM7,XMM1)
3850 
3851           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3852 
3853           /* Third Column */
3854           SSE_COPY_PS(XMM2,XMM6)
3855           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3856           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3857           SSE_SUB_PS(XMM7,XMM2)
3858 
3859           /* Fourth Column */
3860           SSE_COPY_PS(XMM3,XMM6)
3861           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3862           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3863           SSE_SUB_PS(XMM7,XMM3)
3864         SSE_INLINE_END_2
3865         v  += 16;
3866       }
3867       v    = aa + ai16;
3868       ai16 = 16*diag[--i];
3869       PREFETCH_NTA(aa+ai16+16);
3870       /*
3871          Scale the result by the diagonal 4x4 block,
3872          which was inverted as part of the factorization
3873       */
3874       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3875         /* First Column */
3876         SSE_COPY_PS(XMM0,XMM7)
3877         SSE_SHUFFLE(XMM0,XMM0,0x00)
3878         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3879 
3880         /* Second Column */
3881         SSE_COPY_PS(XMM1,XMM7)
3882         SSE_SHUFFLE(XMM1,XMM1,0x55)
3883         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3884         SSE_ADD_PS(XMM0,XMM1)
3885 
3886         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3887 
3888         /* Third Column */
3889         SSE_COPY_PS(XMM2,XMM7)
3890         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3891         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3892         SSE_ADD_PS(XMM0,XMM2)
3893 
3894         /* Fourth Column */
3895         SSE_COPY_PS(XMM3,XMM7)
3896         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3897         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3898         SSE_ADD_PS(XMM0,XMM3)
3899 
3900         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3901       SSE_INLINE_END_3
3902 
3903       v    = aa + ai16 + 16;
3904       idt -= 4;
3905     }
3906 
3907     /* Convert t from single precision back to double precision (inplace)*/
3908     idt = 4*(n-1);
3909     for (i=n-1;i>=0;i--) {
3910       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3911       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3912       PetscScalar *xtemp=&x[idt];
3913       MatScalar   *ttemp=&t[idt];
3914       xtemp[3] = (PetscScalar)ttemp[3];
3915       xtemp[2] = (PetscScalar)ttemp[2];
3916       xtemp[1] = (PetscScalar)ttemp[1];
3917       xtemp[0] = (PetscScalar)ttemp[0];
3918       idt -= 4;
3919     }
3920 
3921   } /* End of artificial scope. */
3922   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3923   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3924   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3925   SSE_SCOPE_END;
3926   PetscFunctionReturn(0);
3927 }
3928 
3929 #undef __FUNCT__
3930 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3931 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
3932 {
3933   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3934   int            *aj=a->j;
3935   PetscErrorCode ierr;
3936   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3937   MatScalar      *aa=a->a;
3938   PetscScalar    *x,*b;
3939 
3940   PetscFunctionBegin;
3941   SSE_SCOPE_BEGIN;
3942   /*
3943      Note: This code currently uses demotion of double
3944      to float when performing the mixed-mode computation.
3945      This may not be numerically reasonable for all applications.
3946   */
3947   PREFETCH_NTA(aa+16*ai[1]);
3948 
3949   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3950   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3951   {
3952     /* x will first be computed in single precision then promoted inplace to double */
3953     MatScalar *v,*t=(MatScalar *)x;
3954     int       nz,i,idt,ai16;
3955     int       jdx,idx;
3956     int       *vi;
3957     /* Forward solve the lower triangular factor. */
3958 
3959     /* First block is the identity. */
3960     idx  = 0;
3961     CONVERT_DOUBLE4_FLOAT4(t,b);
3962     v    =  aa + 16*ai[1];
3963 
3964     for (i=1; i<n;) {
3965       PREFETCH_NTA(&v[8]);
3966       vi   =  aj      + ai[i];
3967       nz   =  diag[i] - ai[i];
3968       idx +=  4;
3969 
3970       /* Demote RHS from double to float. */
3971       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3972       LOAD_PS(&t[idx],XMM7);
3973 
3974       while (nz--) {
3975         PREFETCH_NTA(&v[16]);
3976         jdx = 4*(*vi++);
3977 /*          jdx = *vi++; */
3978 
3979         /* 4x4 Matrix-Vector product with negative accumulation: */
3980         SSE_INLINE_BEGIN_2(&t[jdx],v)
3981           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3982 
3983           /* First Column */
3984           SSE_COPY_PS(XMM0,XMM6)
3985           SSE_SHUFFLE(XMM0,XMM0,0x00)
3986           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3987           SSE_SUB_PS(XMM7,XMM0)
3988 
3989           /* Second Column */
3990           SSE_COPY_PS(XMM1,XMM6)
3991           SSE_SHUFFLE(XMM1,XMM1,0x55)
3992           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3993           SSE_SUB_PS(XMM7,XMM1)
3994 
3995           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3996 
3997           /* Third Column */
3998           SSE_COPY_PS(XMM2,XMM6)
3999           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4000           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4001           SSE_SUB_PS(XMM7,XMM2)
4002 
4003           /* Fourth Column */
4004           SSE_COPY_PS(XMM3,XMM6)
4005           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4006           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4007           SSE_SUB_PS(XMM7,XMM3)
4008         SSE_INLINE_END_2
4009 
4010         v  += 16;
4011       }
4012       v    =  aa + 16*ai[++i];
4013       PREFETCH_NTA(v);
4014       STORE_PS(&t[idx],XMM7);
4015     }
4016 
4017     /* Backward solve the upper triangular factor.*/
4018 
4019     idt  = 4*(n-1);
4020     ai16 = 16*diag[n-1];
4021     v    = aa + ai16 + 16;
4022     for (i=n-1; i>=0;){
4023       PREFETCH_NTA(&v[8]);
4024       vi = aj + diag[i] + 1;
4025       nz = ai[i+1] - diag[i] - 1;
4026 
4027       LOAD_PS(&t[idt],XMM7);
4028 
4029       while (nz--) {
4030         PREFETCH_NTA(&v[16]);
4031         idx = 4*(*vi++);
4032 /*          idx = *vi++; */
4033 
4034         /* 4x4 Matrix-Vector Product with negative accumulation: */
4035         SSE_INLINE_BEGIN_2(&t[idx],v)
4036           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4037 
4038           /* First Column */
4039           SSE_COPY_PS(XMM0,XMM6)
4040           SSE_SHUFFLE(XMM0,XMM0,0x00)
4041           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4042           SSE_SUB_PS(XMM7,XMM0)
4043 
4044           /* Second Column */
4045           SSE_COPY_PS(XMM1,XMM6)
4046           SSE_SHUFFLE(XMM1,XMM1,0x55)
4047           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4048           SSE_SUB_PS(XMM7,XMM1)
4049 
4050           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4051 
4052           /* Third Column */
4053           SSE_COPY_PS(XMM2,XMM6)
4054           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4055           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4056           SSE_SUB_PS(XMM7,XMM2)
4057 
4058           /* Fourth Column */
4059           SSE_COPY_PS(XMM3,XMM6)
4060           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4061           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4062           SSE_SUB_PS(XMM7,XMM3)
4063         SSE_INLINE_END_2
4064         v  += 16;
4065       }
4066       v    = aa + ai16;
4067       ai16 = 16*diag[--i];
4068       PREFETCH_NTA(aa+ai16+16);
4069       /*
4070          Scale the result by the diagonal 4x4 block,
4071          which was inverted as part of the factorization
4072       */
4073       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4074         /* First Column */
4075         SSE_COPY_PS(XMM0,XMM7)
4076         SSE_SHUFFLE(XMM0,XMM0,0x00)
4077         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4078 
4079         /* Second Column */
4080         SSE_COPY_PS(XMM1,XMM7)
4081         SSE_SHUFFLE(XMM1,XMM1,0x55)
4082         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4083         SSE_ADD_PS(XMM0,XMM1)
4084 
4085         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4086 
4087         /* Third Column */
4088         SSE_COPY_PS(XMM2,XMM7)
4089         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4090         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4091         SSE_ADD_PS(XMM0,XMM2)
4092 
4093         /* Fourth Column */
4094         SSE_COPY_PS(XMM3,XMM7)
4095         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4096         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4097         SSE_ADD_PS(XMM0,XMM3)
4098 
4099         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4100       SSE_INLINE_END_3
4101 
4102       v    = aa + ai16 + 16;
4103       idt -= 4;
4104     }
4105 
4106     /* Convert t from single precision back to double precision (inplace)*/
4107     idt = 4*(n-1);
4108     for (i=n-1;i>=0;i--) {
4109       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4110       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4111       PetscScalar *xtemp=&x[idt];
4112       MatScalar   *ttemp=&t[idt];
4113       xtemp[3] = (PetscScalar)ttemp[3];
4114       xtemp[2] = (PetscScalar)ttemp[2];
4115       xtemp[1] = (PetscScalar)ttemp[1];
4116       xtemp[0] = (PetscScalar)ttemp[0];
4117       idt -= 4;
4118     }
4119 
4120   } /* End of artificial scope. */
4121   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4122   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4123   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4124   SSE_SCOPE_END;
4125   PetscFunctionReturn(0);
4126 }
4127 
4128 #endif
4129 
4130 #undef __FUNCT__
4131 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4132 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4133 {
4134   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4135   IS                iscol=a->col,isrow=a->row;
4136   PetscErrorCode    ierr;
4137   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4138   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4139   const MatScalar   *aa=a->a,*v;
4140   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4141   const PetscScalar *b;
4142 
4143   PetscFunctionBegin;
4144   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4145   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4146   t  = a->solve_work;
4147 
4148   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4149   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4150 
4151   /* forward solve the lower triangular */
4152   idx    = 3*(*r++);
4153   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4154   for (i=1; i<n; i++) {
4155     v     = aa + 9*ai[i];
4156     vi    = aj + ai[i];
4157     nz    = diag[i] - ai[i];
4158     idx   = 3*(*r++);
4159     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4160     while (nz--) {
4161       idx   = 3*(*vi++);
4162       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4163       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4164       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4165       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4166       v += 9;
4167     }
4168     idx = 3*i;
4169     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4170   }
4171   /* backward solve the upper triangular */
4172   for (i=n-1; i>=0; i--){
4173     v    = aa + 9*diag[i] + 9;
4174     vi   = aj + diag[i] + 1;
4175     nz   = ai[i+1] - diag[i] - 1;
4176     idt  = 3*i;
4177     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4178     while (nz--) {
4179       idx   = 3*(*vi++);
4180       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4181       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4182       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4183       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4184       v += 9;
4185     }
4186     idc = 3*(*c--);
4187     v   = aa + 9*diag[i];
4188     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4189     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4190     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4191   }
4192   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4193   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4194   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4195   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4196   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4197   PetscFunctionReturn(0);
4198 }
4199 
4200 #undef __FUNCT__
4201 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
4202 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
4203 {
4204   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4205   IS                iscol=a->col,isrow=a->row;
4206   PetscErrorCode    ierr;
4207   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
4208   const PetscInt    *r,*c,*rout,*cout;
4209   const MatScalar   *aa=a->a,*v;
4210   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4211   const PetscScalar *b;
4212 
4213   PetscFunctionBegin;
4214   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4215   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4216   t  = a->solve_work;
4217 
4218   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4219   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4220 
4221   /* forward solve the lower triangular */
4222   idx    = 3*r[0];
4223   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4224   for (i=1; i<n; i++) {
4225     v     = aa + 9*ai[i];
4226     vi    = aj + ai[i];
4227     nz    = ai[i+1] - ai[i];
4228     idx   = 3*r[i];
4229     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4230     for(m=0;m<nz;m++){
4231       idx   = 3*vi[m];
4232       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4233       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4234       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4235       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4236       v += 9;
4237     }
4238     idx = 3*i;
4239     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4240   }
4241   /* backward solve the upper triangular */
4242   for (i=n-1; i>=0; i--){
4243     k    = 2*n-i;
4244     v    = aa + 9*ai[k];
4245     vi   = aj + ai[k];
4246     nz   = ai[k +1] - ai[k] - 1;
4247     idt  = 3*i;
4248     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4249     for(m=0;m<nz;m++){
4250       idx   = 3*vi[m];
4251       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4252       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4253       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4254       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4255       v += 9;
4256     }
4257     idc = 3*c[i];
4258     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4259     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4260     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4261   }
4262   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4263   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4264   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4265   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4266   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4267   PetscFunctionReturn(0);
4268 }
4269 
4270 #undef __FUNCT__
4271 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2"
4272 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4273 {
4274   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4275   IS                iscol=a->col,isrow=a->row;
4276   PetscErrorCode    ierr;
4277   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4278   const PetscInt    *r,*c,*rout,*cout;
4279   const MatScalar   *aa=a->a,*v;
4280   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4281   const PetscScalar *b;
4282 
4283   PetscFunctionBegin;
4284   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4285   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4286   t  = a->solve_work;
4287 
4288   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4289   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4290 
4291   /* forward solve the lower triangular */
4292   idx    = 3*r[0];
4293   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4294   for (i=1; i<n; i++) {
4295     v     = aa + 9*ai[i];
4296     vi    = aj + ai[i];
4297     nz    = ai[i+1] - ai[i];
4298     idx   = 3*r[i];
4299     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4300     for(m=0;m<nz;m++){
4301       idx   = 3*vi[m];
4302       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4303       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4304       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4305       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4306       v += 9;
4307     }
4308     idx = 3*i;
4309     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4310   }
4311   /* backward solve the upper triangular */
4312   for (i=n-1; i>=0; i--){
4313     v    = aa + 9*(adiag[i+1]+1);
4314     vi   = aj + adiag[i+1]+1;
4315     nz   = adiag[i] - adiag[i+1] - 1;
4316     idt  = 3*i;
4317     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4318     for(m=0;m<nz;m++){
4319       idx   = 3*vi[m];
4320       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4321       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4322       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4323       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4324       v += 9;
4325     }
4326     idc = 3*c[i];
4327     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4328     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4329     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4330   }
4331   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4332   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4333   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4334   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4335   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4336   PetscFunctionReturn(0);
4337 }
4338 
4339 /*
4340       Special case where the matrix was ILU(0) factored in the natural
4341    ordering. This eliminates the need for the column and row permutation.
4342 */
4343 #undef __FUNCT__
4344 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4345 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4346 {
4347   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4348   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4349   PetscErrorCode    ierr;
4350   PetscInt          *diag = a->diag;
4351   const MatScalar   *aa=a->a,*v;
4352   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4353   const PetscScalar *b;
4354   PetscInt          jdx,idt,idx,nz,*vi,i;
4355 
4356   PetscFunctionBegin;
4357   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4358   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4359 
4360   /* forward solve the lower triangular */
4361   idx    = 0;
4362   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4363   for (i=1; i<n; i++) {
4364     v     =  aa      + 9*ai[i];
4365     vi    =  aj      + ai[i];
4366     nz    =  diag[i] - ai[i];
4367     idx   +=  3;
4368     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4369     while (nz--) {
4370       jdx   = 3*(*vi++);
4371       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4372       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4373       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4374       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4375       v    += 9;
4376     }
4377     x[idx]   = s1;
4378     x[1+idx] = s2;
4379     x[2+idx] = s3;
4380   }
4381   /* backward solve the upper triangular */
4382   for (i=n-1; i>=0; i--){
4383     v    = aa + 9*diag[i] + 9;
4384     vi   = aj + diag[i] + 1;
4385     nz   = ai[i+1] - diag[i] - 1;
4386     idt  = 3*i;
4387     s1 = x[idt];  s2 = x[1+idt];
4388     s3 = x[2+idt];
4389     while (nz--) {
4390       idx   = 3*(*vi++);
4391       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4392       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4393       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4394       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4395       v    += 9;
4396     }
4397     v        = aa +  9*diag[i];
4398     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4399     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4400     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4401   }
4402 
4403   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4404   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4405   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4406   PetscFunctionReturn(0);
4407 }
4408 
4409 #undef __FUNCT__
4410 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4411 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4412 {
4413     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4414     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4415     PetscErrorCode    ierr;
4416     PetscInt          idx,jdx,idt;
4417     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4418     const MatScalar   *aa=a->a,*v;
4419     PetscScalar       *x;
4420     const PetscScalar *b;
4421     PetscScalar        s1,s2,s3,x1,x2,x3;
4422 
4423     PetscFunctionBegin;
4424     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4425     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4426     /* forward solve the lower triangular */
4427     idx    = 0;
4428     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4429     for (i=1; i<n; i++) {
4430        v    = aa + bs2*ai[i];
4431        vi   = aj + ai[i];
4432        nz   = ai[i+1] - ai[i];
4433       idx   = bs*i;
4434        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4435       for(k=0;k<nz;k++){
4436          jdx   = bs*vi[k];
4437           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4438           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4439           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4440           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4441 
4442           v   +=  bs2;
4443         }
4444 
4445        x[idx]   = s1;
4446        x[1+idx] = s2;
4447        x[2+idx] = s3;
4448     }
4449 
4450    /* backward solve the upper triangular */
4451   for (i=n-1; i>=0; i--){
4452      v   = aa + bs2*ai[2*n-i];
4453      vi  = aj + ai[2*n-i];
4454      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4455      idt = bs*i;
4456      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4457 
4458      for(k=0;k<nz;k++){
4459        idx   = bs*vi[k];
4460        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4461        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4462        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4463        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4464 
4465         v   +=  bs2;
4466     }
4467     /* x = inv_diagonal*x */
4468    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4469    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4470    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4471 
4472   }
4473 
4474   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4475   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4476   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4477   PetscFunctionReturn(0);
4478 }
4479 
4480 #undef __FUNCT__
4481 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
4482 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4483 {
4484     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4485     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4486     PetscErrorCode    ierr;
4487     PetscInt          idx,jdx,idt;
4488     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4489     const MatScalar   *aa=a->a,*v;
4490     PetscScalar       *x;
4491     const PetscScalar *b;
4492     PetscScalar        s1,s2,s3,x1,x2,x3;
4493 
4494     PetscFunctionBegin;
4495     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4496     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4497     /* forward solve the lower triangular */
4498     idx    = 0;
4499     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4500     for (i=1; i<n; i++) {
4501        v    = aa + bs2*ai[i];
4502        vi   = aj + ai[i];
4503        nz   = ai[i+1] - ai[i];
4504       idx   = bs*i;
4505        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4506       for(k=0;k<nz;k++){
4507          jdx   = bs*vi[k];
4508           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4509           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4510           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4511           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4512 
4513           v   +=  bs2;
4514         }
4515 
4516        x[idx]   = s1;
4517        x[1+idx] = s2;
4518        x[2+idx] = s3;
4519     }
4520 
4521    /* backward solve the upper triangular */
4522   for (i=n-1; i>=0; i--){
4523     v   = aa + bs2*(adiag[i+1]+1);
4524      vi  = aj + adiag[i+1]+1;
4525      nz  = adiag[i] - adiag[i+1]-1;
4526      idt = bs*i;
4527      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4528 
4529      for(k=0;k<nz;k++){
4530        idx   = bs*vi[k];
4531        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4532        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4533        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4534        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4535 
4536         v   +=  bs2;
4537     }
4538     /* x = inv_diagonal*x */
4539    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4540    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4541    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4542 
4543   }
4544 
4545   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4546   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4547   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4548   PetscFunctionReturn(0);
4549 }
4550 
4551 #undef __FUNCT__
4552 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4553 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4554 {
4555   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4556   IS                iscol=a->col,isrow=a->row;
4557   PetscErrorCode    ierr;
4558   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4559   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4560   const MatScalar   *aa=a->a,*v;
4561   PetscScalar       *x,s1,s2,x1,x2,*t;
4562   const PetscScalar *b;
4563 
4564   PetscFunctionBegin;
4565   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4566   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4567   t  = a->solve_work;
4568 
4569   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4570   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4571 
4572   /* forward solve the lower triangular */
4573   idx    = 2*(*r++);
4574   t[0] = b[idx]; t[1] = b[1+idx];
4575   for (i=1; i<n; i++) {
4576     v     = aa + 4*ai[i];
4577     vi    = aj + ai[i];
4578     nz    = diag[i] - ai[i];
4579     idx   = 2*(*r++);
4580     s1  = b[idx]; s2 = b[1+idx];
4581     while (nz--) {
4582       idx   = 2*(*vi++);
4583       x1    = t[idx]; x2 = t[1+idx];
4584       s1 -= v[0]*x1 + v[2]*x2;
4585       s2 -= v[1]*x1 + v[3]*x2;
4586       v += 4;
4587     }
4588     idx = 2*i;
4589     t[idx] = s1; t[1+idx] = s2;
4590   }
4591   /* backward solve the upper triangular */
4592   for (i=n-1; i>=0; i--){
4593     v    = aa + 4*diag[i] + 4;
4594     vi   = aj + diag[i] + 1;
4595     nz   = ai[i+1] - diag[i] - 1;
4596     idt  = 2*i;
4597     s1 = t[idt]; s2 = t[1+idt];
4598     while (nz--) {
4599       idx   = 2*(*vi++);
4600       x1    = t[idx]; x2 = t[1+idx];
4601       s1 -= v[0]*x1 + v[2]*x2;
4602       s2 -= v[1]*x1 + v[3]*x2;
4603       v += 4;
4604     }
4605     idc = 2*(*c--);
4606     v   = aa + 4*diag[i];
4607     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4608     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4609   }
4610   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4611   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4612   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4613   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4614   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4615   PetscFunctionReturn(0);
4616 }
4617 
4618 #undef __FUNCT__
4619 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4620 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
4621 {
4622   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4623   IS                iscol=a->col,isrow=a->row;
4624   PetscErrorCode    ierr;
4625   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
4626   const PetscInt    *r,*c,*rout,*cout;
4627   const MatScalar   *aa=a->a,*v;
4628   PetscScalar       *x,s1,s2,x1,x2,*t;
4629   const PetscScalar *b;
4630 
4631   PetscFunctionBegin;
4632   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4633   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4634   t  = a->solve_work;
4635 
4636   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4637   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4638 
4639   /* forward solve the lower triangular */
4640   idx    = 2*r[0];
4641   t[0] = b[idx]; t[1] = b[1+idx];
4642   for (i=1; i<n; i++) {
4643     v     = aa + 4*ai[i];
4644     vi    = aj + ai[i];
4645     nz    = ai[i+1] - ai[i];
4646     idx   = 2*r[i];
4647     s1  = b[idx]; s2 = b[1+idx];
4648     for(m=0;m<nz;m++){
4649       jdx   = 2*vi[m];
4650       x1    = t[jdx]; x2 = t[1+jdx];
4651       s1 -= v[0]*x1 + v[2]*x2;
4652       s2 -= v[1]*x1 + v[3]*x2;
4653       v += 4;
4654     }
4655     idx = 2*i;
4656     t[idx] = s1; t[1+idx] = s2;
4657   }
4658   /* backward solve the upper triangular */
4659   for (i=n-1; i>=0; i--){
4660     k = 2*n-i;
4661     v    = aa + 4*ai[k];
4662     vi   = aj + ai[k];
4663     nz   = ai[k +1] - ai[k] - 1;
4664     idt  = 2*i;
4665     s1 = t[idt]; s2 = t[1+idt];
4666     for(m=0;m<nz;m++){
4667       idx   = 2*vi[m];
4668       x1    = t[idx]; x2 = t[1+idx];
4669       s1 -= v[0]*x1 + v[2]*x2;
4670       s2 -= v[1]*x1 + v[3]*x2;
4671       v += 4;
4672     }
4673     idc = 2*c[i];
4674     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4675     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4676   }
4677   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4678   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4679   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4680   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4681   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4682   PetscFunctionReturn(0);
4683 }
4684 
4685 #undef __FUNCT__
4686 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
4687 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4688 {
4689   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4690   IS                iscol=a->col,isrow=a->row;
4691   PetscErrorCode    ierr;
4692   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4693   const PetscInt    *r,*c,*rout,*cout;
4694   const MatScalar   *aa=a->a,*v;
4695   PetscScalar       *x,s1,s2,x1,x2,*t;
4696   const PetscScalar *b;
4697 
4698   PetscFunctionBegin;
4699   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4700   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4701   t  = a->solve_work;
4702 
4703   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4704   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4705 
4706   /* forward solve the lower triangular */
4707   idx    = 2*r[0];
4708   t[0] = b[idx]; t[1] = b[1+idx];
4709   for (i=1; i<n; i++) {
4710     v     = aa + 4*ai[i];
4711     vi    = aj + ai[i];
4712     nz    = ai[i+1] - ai[i];
4713     idx   = 2*r[i];
4714     s1  = b[idx]; s2 = b[1+idx];
4715     for(m=0;m<nz;m++){
4716       jdx   = 2*vi[m];
4717       x1    = t[jdx]; x2 = t[1+jdx];
4718       s1 -= v[0]*x1 + v[2]*x2;
4719       s2 -= v[1]*x1 + v[3]*x2;
4720       v += 4;
4721     }
4722     idx = 2*i;
4723     t[idx] = s1; t[1+idx] = s2;
4724   }
4725   /* backward solve the upper triangular */
4726   for (i=n-1; i>=0; i--){
4727     v    = aa + 4*(adiag[i+1]+1);
4728     vi   = aj + adiag[i+1]+1;
4729     nz   = adiag[i] - adiag[i+1] - 1;
4730     idt  = 2*i;
4731     s1 = t[idt]; s2 = t[1+idt];
4732     for(m=0;m<nz;m++){
4733       idx   = 2*vi[m];
4734       x1    = t[idx]; x2 = t[1+idx];
4735       s1 -= v[0]*x1 + v[2]*x2;
4736       s2 -= v[1]*x1 + v[3]*x2;
4737       v += 4;
4738     }
4739     idc = 2*c[i];
4740     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4741     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4742   }
4743   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4744   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4745   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4746   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4747   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4748   PetscFunctionReturn(0);
4749 }
4750 
4751 /*
4752       Special case where the matrix was ILU(0) factored in the natural
4753    ordering. This eliminates the need for the column and row permutation.
4754 */
4755 #undef __FUNCT__
4756 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4757 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
4758 {
4759   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4760   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4761   PetscErrorCode    ierr;
4762   PetscInt          *diag = a->diag;
4763   const MatScalar   *aa=a->a,*v;
4764   PetscScalar       *x,s1,s2,x1,x2;
4765   const PetscScalar *b;
4766   PetscInt          jdx,idt,idx,nz,*vi,i;
4767 
4768   PetscFunctionBegin;
4769   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4770   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4771 
4772   /* forward solve the lower triangular */
4773   idx    = 0;
4774   x[0]   = b[0]; x[1] = b[1];
4775   for (i=1; i<n; i++) {
4776     v     =  aa      + 4*ai[i];
4777     vi    =  aj      + ai[i];
4778     nz    =  diag[i] - ai[i];
4779     idx   +=  2;
4780     s1  =  b[idx];s2 = b[1+idx];
4781     while (nz--) {
4782       jdx   = 2*(*vi++);
4783       x1    = x[jdx];x2 = x[1+jdx];
4784       s1 -= v[0]*x1 + v[2]*x2;
4785       s2 -= v[1]*x1 + v[3]*x2;
4786       v    += 4;
4787     }
4788     x[idx]   = s1;
4789     x[1+idx] = s2;
4790   }
4791   /* backward solve the upper triangular */
4792   for (i=n-1; i>=0; i--){
4793     v    = aa + 4*diag[i] + 4;
4794     vi   = aj + diag[i] + 1;
4795     nz   = ai[i+1] - diag[i] - 1;
4796     idt  = 2*i;
4797     s1 = x[idt];  s2 = x[1+idt];
4798     while (nz--) {
4799       idx   = 2*(*vi++);
4800       x1    = x[idx];   x2 = x[1+idx];
4801       s1 -= v[0]*x1 + v[2]*x2;
4802       s2 -= v[1]*x1 + v[3]*x2;
4803       v    += 4;
4804     }
4805     v        = aa +  4*diag[i];
4806     x[idt]   = v[0]*s1 + v[2]*s2;
4807     x[1+idt] = v[1]*s1 + v[3]*s2;
4808   }
4809 
4810   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4811   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4812   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4813   PetscFunctionReturn(0);
4814 }
4815 
4816 #undef __FUNCT__
4817 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4818 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4819 {
4820     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4821     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4822     PetscErrorCode    ierr;
4823     PetscInt          jdx;
4824     const MatScalar   *aa=a->a,*v;
4825     PetscScalar       *x,s1,s2,x1,x2;
4826     const PetscScalar *b;
4827 
4828     PetscFunctionBegin;
4829     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4830     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4831     /* forward solve the lower triangular */
4832     idx    = 0;
4833     x[0] = b[idx]; x[1] = b[1+idx];
4834     for (i=1; i<n; i++) {
4835         v   = aa + 4*ai[i];
4836        vi   = aj + ai[i];
4837        nz   = ai[i+1] - ai[i];
4838        idx  = 2*i;
4839        s1   = b[idx];s2 = b[1+idx];
4840       for(k=0;k<nz;k++){
4841          jdx   = 2*vi[k];
4842           x1    = x[jdx];x2 = x[1+jdx];
4843           s1   -= v[0]*x1 + v[2]*x2;
4844           s2   -= v[1]*x1 + v[3]*x2;
4845            v   +=  4;
4846         }
4847        x[idx]   = s1;
4848        x[1+idx] = s2;
4849     }
4850 
4851    /* backward solve the upper triangular */
4852   for (i=n-1; i>=0; i--){
4853      v   = aa + 4*ai[2*n-i];
4854      vi  = aj + ai[2*n-i];
4855      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4856      idt = 2*i;
4857      s1 = x[idt];  s2 = x[1+idt];
4858      for(k=0;k<nz;k++){
4859       idx   = 2*vi[k];
4860        x1    = x[idx];   x2 = x[1+idx];
4861        s1 -= v[0]*x1 + v[2]*x2;
4862        s2 -= v[1]*x1 + v[3]*x2;
4863          v    += 4;
4864     }
4865     /* x = inv_diagonal*x */
4866    x[idt]   = v[0]*s1 + v[2]*s2;
4867    x[1+idt] = v[1]*s1 + v[3]*s2;
4868   }
4869 
4870   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4871   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4872   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4873   PetscFunctionReturn(0);
4874 }
4875 
4876 #undef __FUNCT__
4877 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4878 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4879 {
4880     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4881     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4882     PetscErrorCode    ierr;
4883     PetscInt          jdx;
4884     const MatScalar   *aa=a->a,*v;
4885     PetscScalar       *x,s1,s2,x1,x2;
4886     const PetscScalar *b;
4887 
4888     PetscFunctionBegin;
4889     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4890     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4891     /* forward solve the lower triangular */
4892     idx    = 0;
4893     x[0] = b[idx]; x[1] = b[1+idx];
4894     for (i=1; i<n; i++) {
4895         v   = aa + 4*ai[i];
4896        vi   = aj + ai[i];
4897        nz   = ai[i+1] - ai[i];
4898        idx  = 2*i;
4899        s1   = b[idx];s2 = b[1+idx];
4900       for(k=0;k<nz;k++){
4901          jdx   = 2*vi[k];
4902           x1    = x[jdx];x2 = x[1+jdx];
4903           s1   -= v[0]*x1 + v[2]*x2;
4904           s2   -= v[1]*x1 + v[3]*x2;
4905            v   +=  4;
4906         }
4907        x[idx]   = s1;
4908        x[1+idx] = s2;
4909     }
4910 
4911    /* backward solve the upper triangular */
4912   for (i=n-1; i>=0; i--){
4913      v   = aa + 4*(adiag[i+1]+1);
4914      vi  = aj + adiag[i+1]+1;
4915      nz  = adiag[i] - adiag[i+1]-1;
4916      idt = 2*i;
4917      s1 = x[idt];  s2 = x[1+idt];
4918      for(k=0;k<nz;k++){
4919       idx   = 2*vi[k];
4920        x1    = x[idx];   x2 = x[1+idx];
4921        s1 -= v[0]*x1 + v[2]*x2;
4922        s2 -= v[1]*x1 + v[3]*x2;
4923          v    += 4;
4924     }
4925     /* x = inv_diagonal*x */
4926    x[idt]   = v[0]*s1 + v[2]*s2;
4927    x[1+idt] = v[1]*s1 + v[3]*s2;
4928   }
4929 
4930   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4931   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4932   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4933   PetscFunctionReturn(0);
4934 }
4935 
4936 #undef __FUNCT__
4937 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4938 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4939 {
4940   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4941   IS             iscol=a->col,isrow=a->row;
4942   PetscErrorCode ierr;
4943   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4944   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4945   MatScalar      *aa=a->a,*v;
4946   PetscScalar    *x,*b,s1,*t;
4947 
4948   PetscFunctionBegin;
4949   if (!n) PetscFunctionReturn(0);
4950 
4951   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4952   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4953   t  = a->solve_work;
4954 
4955   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4956   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4957 
4958   /* forward solve the lower triangular */
4959   t[0] = b[*r++];
4960   for (i=1; i<n; i++) {
4961     v     = aa + ai[i];
4962     vi    = aj + ai[i];
4963     nz    = diag[i] - ai[i];
4964     s1  = b[*r++];
4965     while (nz--) {
4966       s1 -= (*v++)*t[*vi++];
4967     }
4968     t[i] = s1;
4969   }
4970   /* backward solve the upper triangular */
4971   for (i=n-1; i>=0; i--){
4972     v    = aa + diag[i] + 1;
4973     vi   = aj + diag[i] + 1;
4974     nz   = ai[i+1] - diag[i] - 1;
4975     s1 = t[i];
4976     while (nz--) {
4977       s1 -= (*v++)*t[*vi++];
4978     }
4979     x[*c--] = t[i] = aa[diag[i]]*s1;
4980   }
4981 
4982   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4983   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4984   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4985   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4986   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4987   PetscFunctionReturn(0);
4988 }
4989 /*
4990       Special case where the matrix was ILU(0) factored in the natural
4991    ordering. This eliminates the need for the column and row permutation.
4992 */
4993 #undef __FUNCT__
4994 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4995 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
4996 {
4997   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4998   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4999   PetscErrorCode ierr;
5000   PetscInt       *diag = a->diag;
5001   MatScalar      *aa=a->a;
5002   PetscScalar    *x,*b;
5003   PetscScalar    s1,x1;
5004   MatScalar      *v;
5005   PetscInt       jdx,idt,idx,nz,*vi,i;
5006 
5007   PetscFunctionBegin;
5008   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5009   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5010 
5011   /* forward solve the lower triangular */
5012   idx    = 0;
5013   x[0]   = b[0];
5014   for (i=1; i<n; i++) {
5015     v     =  aa      + ai[i];
5016     vi    =  aj      + ai[i];
5017     nz    =  diag[i] - ai[i];
5018     idx   +=  1;
5019     s1  =  b[idx];
5020     while (nz--) {
5021       jdx   = *vi++;
5022       x1    = x[jdx];
5023       s1 -= v[0]*x1;
5024       v    += 1;
5025     }
5026     x[idx]   = s1;
5027   }
5028   /* backward solve the upper triangular */
5029   for (i=n-1; i>=0; i--){
5030     v    = aa + diag[i] + 1;
5031     vi   = aj + diag[i] + 1;
5032     nz   = ai[i+1] - diag[i] - 1;
5033     idt  = i;
5034     s1 = x[idt];
5035     while (nz--) {
5036       idx   = *vi++;
5037       x1    = x[idx];
5038       s1 -= v[0]*x1;
5039       v    += 1;
5040     }
5041     v        = aa +  diag[i];
5042     x[idt]   = v[0]*s1;
5043   }
5044   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5045   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5046   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5047   PetscFunctionReturn(0);
5048 }
5049 
5050 /* ----------------------------------------------------------------*/
5051 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5052 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
5053 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
5054 
5055 #undef __FUNCT__
5056 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
5057 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
5058 {
5059   Mat            C=B;
5060   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5061   IS             isrow = b->row,isicol = b->icol;
5062   PetscErrorCode ierr;
5063   const PetscInt *r,*ic,*ics;
5064   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5065   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5066   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5067   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5068   MatScalar      *v_work;
5069   PetscTruth     col_identity,row_identity,both_identity;
5070 
5071   PetscFunctionBegin;
5072   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5073   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5074 
5075   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5076   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5077   ics  = ic;
5078 
5079   /* generate work space needed by dense LU factorization */
5080   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5081 
5082   for (i=0; i<n; i++){
5083     /* zero rtmp */
5084     /* L part */
5085     nz    = bi[i+1] - bi[i];
5086     bjtmp = bj + bi[i];
5087     for  (j=0; j<nz; j++){
5088       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5089     }
5090 
5091     /* U part */
5092     nz = bdiag[i] - bdiag[i+1];
5093     bjtmp = bj + bdiag[i+1]+1;
5094     for  (j=0; j<nz; j++){
5095       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5096     }
5097 
5098     /* load in initial (unfactored row) */
5099     nz    = ai[r[i]+1] - ai[r[i]];
5100     ajtmp = aj + ai[r[i]];
5101     v     = aa + bs2*ai[r[i]];
5102     for (j=0; j<nz; j++) {
5103       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5104     }
5105 
5106     /* elimination */
5107     bjtmp = bj + bi[i];
5108     nzL   = bi[i+1] - bi[i];
5109     for(k=0;k < nzL;k++) {
5110       row = bjtmp[k];
5111       pc = rtmp + bs2*row;
5112       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5113       if (flg) {
5114         pv         = b->a + bs2*bdiag[row];
5115         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5116         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5117         pv         = b->a + bs2*(bdiag[row+1]+1);
5118         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5119         for (j=0; j<nz; j++) {
5120           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5121         }
5122         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5123       }
5124     }
5125 
5126     /* finished row so stick it into b->a */
5127     /* L part */
5128     pv   = b->a + bs2*bi[i] ;
5129     pj   = b->j + bi[i] ;
5130     nz   = bi[i+1] - bi[i];
5131     for (j=0; j<nz; j++) {
5132       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5133     }
5134 
5135     /* Mark diagonal and invert diagonal for simplier triangular solves */
5136     pv  = b->a + bs2*bdiag[i];
5137     pj  = b->j + bdiag[i];
5138     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5139     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5140     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5141 
5142     /* U part */
5143     pv = b->a + bs2*(bdiag[i+1]+1);
5144     pj = b->j + bdiag[i+1]+1;
5145     nz = bdiag[i] - bdiag[i+1] - 1;
5146     for (j=0; j<nz; j++){
5147       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5148     }
5149   }
5150 
5151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5152   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5153   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5154   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5155 
5156   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5157   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5158   both_identity = (PetscTruth) (row_identity && col_identity);
5159   if (both_identity){
5160     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2;
5161   } else {
5162     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct_v2;
5163   }
5164 
5165   C->assembled = PETSC_TRUE;
5166   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5167   PetscFunctionReturn(0);
5168 }
5169 
5170 /*
5171    ilu(0) with natural ordering under new data structure.
5172    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
5173    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
5174 */
5175 
5176 #undef __FUNCT__
5177 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
5178 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5179 {
5180 
5181   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5182   PetscErrorCode     ierr;
5183   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5184   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5185 
5186   PetscFunctionBegin;
5187   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5188   b    = (Mat_SeqBAIJ*)(fact)->data;
5189 
5190   /* allocate matrix arrays for new data structure */
5191   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5192   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5193   b->singlemalloc = PETSC_TRUE;
5194   if (!b->diag){
5195     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5196     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5197   }
5198   bdiag = b->diag;
5199 
5200   if (n > 0) {
5201     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5202   }
5203 
5204   /* set bi and bj with new data structure */
5205   bi = b->i;
5206   bj = b->j;
5207 
5208   /* L part */
5209   bi[0] = 0;
5210   for (i=0; i<n; i++){
5211     nz = adiag[i] - ai[i];
5212     bi[i+1] = bi[i] + nz;
5213     aj = a->j + ai[i];
5214     for (j=0; j<nz; j++){
5215       *bj = aj[j]; bj++;
5216     }
5217   }
5218 
5219   /* U part */
5220   bi_temp = bi[n];
5221   bdiag[n] = bi[n]-1;
5222   for (i=n-1; i>=0; i--){
5223     nz = ai[i+1] - adiag[i] - 1;
5224     bi_temp = bi_temp + nz + 1;
5225     aj = a->j + adiag[i] + 1;
5226     for (j=0; j<nz; j++){
5227       *bj = aj[j]; bj++;
5228     }
5229     /* diag[i] */
5230     *bj = i; bj++;
5231     bdiag[i] = bi_temp - 1;
5232   }
5233   PetscFunctionReturn(0);
5234 }
5235 
5236 #undef __FUNCT__
5237 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
5238 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5239 {
5240   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5241   IS                 isicol;
5242   PetscErrorCode     ierr;
5243   const PetscInt     *r,*ic;
5244   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5245   PetscInt           *bi,*cols,nnz,*cols_lvl;
5246   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5247   PetscInt           i,levels,diagonal_fill;
5248   PetscTruth         col_identity,row_identity,both_identity;
5249   PetscReal          f;
5250   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5251   PetscBT            lnkbt;
5252   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5253   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5254   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5255   PetscTruth         missing;
5256   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5257 
5258   PetscFunctionBegin;
5259   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5260   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5261   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5262 
5263   f             = info->fill;
5264   levels        = (PetscInt)info->levels;
5265   diagonal_fill = (PetscInt)info->diagonal_fill;
5266   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5267 
5268   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5269   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5270   both_identity = (PetscTruth) (row_identity && col_identity);
5271 
5272   if (!levels && both_identity) {
5273     /* special case: ilu(0) with natural ordering */
5274     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5275     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
5276 
5277     fact->factor = MAT_FACTOR_ILU;
5278     (fact)->info.factor_mallocs    = 0;
5279     (fact)->info.fill_ratio_given  = info->fill;
5280     (fact)->info.fill_ratio_needed = 1.0;
5281     b                = (Mat_SeqBAIJ*)(fact)->data;
5282     b->row           = isrow;
5283     b->col           = iscol;
5284     b->icol          = isicol;
5285     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5286     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5287     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5288     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5289     PetscFunctionReturn(0);
5290   }
5291 
5292   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5293   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5294 
5295   /* get new row pointers */
5296   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5297   bi[0] = 0;
5298   /* bdiag is location of diagonal in factor */
5299   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5300   bdiag[0]  = 0;
5301 
5302   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5303 
5304   /* create a linked list for storing column indices of the active row */
5305   nlnk = n + 1;
5306   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5307 
5308   /* initial FreeSpace size is f*(ai[n]+1) */
5309   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5310   current_space = free_space;
5311   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5312   current_space_lvl = free_space_lvl;
5313 
5314   for (i=0; i<n; i++) {
5315     nzi = 0;
5316     /* copy current row into linked list */
5317     nnz  = ai[r[i]+1] - ai[r[i]];
5318     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5319     cols = aj + ai[r[i]];
5320     lnk[i] = -1; /* marker to indicate if diagonal exists */
5321     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5322     nzi += nlnk;
5323 
5324     /* make sure diagonal entry is included */
5325     if (diagonal_fill && lnk[i] == -1) {
5326       fm = n;
5327       while (lnk[fm] < i) fm = lnk[fm];
5328       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5329       lnk[fm]    = i;
5330       lnk_lvl[i] = 0;
5331       nzi++; dcount++;
5332     }
5333 
5334     /* add pivot rows into the active row */
5335     nzbd = 0;
5336     prow = lnk[n];
5337     while (prow < i) {
5338       nnz      = bdiag[prow];
5339       cols     = bj_ptr[prow] + nnz + 1;
5340       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5341       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5342       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5343       nzi += nlnk;
5344       prow = lnk[prow];
5345       nzbd++;
5346     }
5347     bdiag[i] = nzbd;
5348     bi[i+1]  = bi[i] + nzi;
5349 
5350     /* if free space is not available, make more free space */
5351     if (current_space->local_remaining<nzi) {
5352       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5353       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5354       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5355       reallocs++;
5356     }
5357 
5358     /* copy data into free_space and free_space_lvl, then initialize lnk */
5359     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5360     bj_ptr[i]    = current_space->array;
5361     bjlvl_ptr[i] = current_space_lvl->array;
5362 
5363     /* make sure the active row i has diagonal entry */
5364     if (*(bj_ptr[i]+bdiag[i]) != i) {
5365       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5366     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5367     }
5368 
5369     current_space->array           += nzi;
5370     current_space->local_used      += nzi;
5371     current_space->local_remaining -= nzi;
5372     current_space_lvl->array           += nzi;
5373     current_space_lvl->local_used      += nzi;
5374     current_space_lvl->local_remaining -= nzi;
5375   }
5376 
5377   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5378   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5379 
5380   /* destroy list of free space and other temporary arrays */
5381   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5382 
5383   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5384   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5385 
5386   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5387   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5388   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5389 
5390 #if defined(PETSC_USE_INFO)
5391   {
5392     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5393     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5394     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5395     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5396     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5397     if (diagonal_fill) {
5398       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5399     }
5400   }
5401 #endif
5402 
5403   /* put together the new matrix */
5404   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5405   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5406   b = (Mat_SeqBAIJ*)(fact)->data;
5407   b->free_a       = PETSC_TRUE;
5408   b->free_ij      = PETSC_TRUE;
5409   b->singlemalloc = PETSC_FALSE;
5410   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5411   b->j          = bj;
5412   b->i          = bi;
5413   b->diag       = bdiag;
5414   b->free_diag  = PETSC_TRUE;
5415   b->ilen       = 0;
5416   b->imax       = 0;
5417   b->row        = isrow;
5418   b->col        = iscol;
5419   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5420   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5421   b->icol       = isicol;
5422   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5423   /* In b structure:  Free imax, ilen, old a, old j.
5424      Allocate bdiag, solve_work, new a, new j */
5425   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5426   b->maxnz = b->nz = bdiag[0]+1;
5427   fact->info.factor_mallocs    = reallocs;
5428   fact->info.fill_ratio_given  = f;
5429   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5430   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
5431   PetscFunctionReturn(0);
5432 }
5433 
5434 
5435 /*
5436      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5437    except that the data structure of Mat_SeqAIJ is slightly different.
5438    Not a good example of code reuse.
5439 */
5440 #undef __FUNCT__
5441 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5442 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5443 {
5444   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5445   IS             isicol;
5446   PetscErrorCode ierr;
5447   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5448   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5449   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5450   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5451   PetscTruth     col_identity,row_identity,both_identity,flg;
5452   PetscReal      f;
5453   PetscTruth     newdatastruct = PETSC_FALSE;
5454 
5455   PetscFunctionBegin;
5456   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
5457   if (newdatastruct){
5458     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5459     PetscFunctionReturn(0);
5460   }
5461 
5462   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5463   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5464 
5465   f             = info->fill;
5466   levels        = (PetscInt)info->levels;
5467   diagonal_fill = (PetscInt)info->diagonal_fill;
5468   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5469 
5470   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5471   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5472   both_identity = (PetscTruth) (row_identity && col_identity);
5473 
5474   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5475     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5476     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5477 
5478     fact->factor = MAT_FACTOR_ILU;
5479     b            = (Mat_SeqBAIJ*)fact->data;
5480     b->row       = isrow;
5481     b->col       = iscol;
5482     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5483     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5484     b->icol      = isicol;
5485     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5486     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5487     PetscFunctionReturn(0);
5488   }
5489 
5490   /* general case perform the symbolic factorization */
5491     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5492     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5493 
5494     /* get new row pointers */
5495     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5496     ainew[0] = 0;
5497     /* don't know how many column pointers are needed so estimate */
5498     jmax = (PetscInt)(f*ai[n] + 1);
5499     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5500     /* ajfill is level of fill for each fill entry */
5501     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5502     /* fill is a linked list of nonzeros in active row */
5503     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5504     /* im is level for each filled value */
5505     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5506     /* dloc is location of diagonal in factor */
5507     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5508     dloc[0]  = 0;
5509     for (prow=0; prow<n; prow++) {
5510 
5511       /* copy prow into linked list */
5512       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5513       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5514       xi         = aj + ai[r[prow]];
5515       fill[n]    = n;
5516       fill[prow] = -1; /* marker for diagonal entry */
5517       while (nz--) {
5518 	fm  = n;
5519 	idx = ic[*xi++];
5520 	do {
5521 	  m  = fm;
5522 	  fm = fill[m];
5523 	} while (fm < idx);
5524 	fill[m]   = idx;
5525 	fill[idx] = fm;
5526 	im[idx]   = 0;
5527       }
5528 
5529       /* make sure diagonal entry is included */
5530       if (diagonal_fill && fill[prow] == -1) {
5531 	fm = n;
5532 	while (fill[fm] < prow) fm = fill[fm];
5533 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5534 	fill[fm]   = prow;
5535 	im[prow]   = 0;
5536 	nzf++;
5537 	dcount++;
5538       }
5539 
5540       nzi = 0;
5541       row = fill[n];
5542       while (row < prow) {
5543 	incrlev = im[row] + 1;
5544 	nz      = dloc[row];
5545 	xi      = ajnew  + ainew[row] + nz + 1;
5546 	flev    = ajfill + ainew[row] + nz + 1;
5547 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5548 	fm      = row;
5549 	while (nnz-- > 0) {
5550 	  idx = *xi++;
5551 	  if (*flev + incrlev > levels) {
5552 	    flev++;
5553 	    continue;
5554 	  }
5555 	  do {
5556 	    m  = fm;
5557 	    fm = fill[m];
5558 	  } while (fm < idx);
5559 	  if (fm != idx) {
5560 	    im[idx]   = *flev + incrlev;
5561 	    fill[m]   = idx;
5562 	    fill[idx] = fm;
5563 	    fm        = idx;
5564 	    nzf++;
5565 	  } else {
5566 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5567 	  }
5568 	  flev++;
5569 	}
5570 	row = fill[row];
5571 	nzi++;
5572       }
5573       /* copy new filled row into permanent storage */
5574       ainew[prow+1] = ainew[prow] + nzf;
5575       if (ainew[prow+1] > jmax) {
5576 
5577 	/* estimate how much additional space we will need */
5578 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5579 	/* just double the memory each time */
5580 	PetscInt maxadd = jmax;
5581 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5582 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5583 	jmax += maxadd;
5584 
5585 	/* allocate a longer ajnew and ajfill */
5586 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5587 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5588 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5589 	ajnew = xitmp;
5590 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5591 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5592 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5593 	ajfill = xitmp;
5594 	reallocate++; /* count how many reallocations are needed */
5595       }
5596       xitmp       = ajnew + ainew[prow];
5597       flev        = ajfill + ainew[prow];
5598       dloc[prow]  = nzi;
5599       fm          = fill[n];
5600       while (nzf--) {
5601 	*xitmp++ = fm;
5602 	*flev++ = im[fm];
5603 	fm      = fill[fm];
5604       }
5605       /* make sure row has diagonal entry */
5606       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5607 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5608     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5609       }
5610     }
5611     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5612     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5613     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5614     ierr = PetscFree(fill);CHKERRQ(ierr);
5615     ierr = PetscFree(im);CHKERRQ(ierr);
5616 
5617 #if defined(PETSC_USE_INFO)
5618     {
5619       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5620       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5621       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5622       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5623       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5624       if (diagonal_fill) {
5625 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5626       }
5627     }
5628 #endif
5629 
5630     /* put together the new matrix */
5631     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5632     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5633     b    = (Mat_SeqBAIJ*)fact->data;
5634     b->free_a       = PETSC_TRUE;
5635     b->free_ij      = PETSC_TRUE;
5636     b->singlemalloc = PETSC_FALSE;
5637     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5638     b->j          = ajnew;
5639     b->i          = ainew;
5640     for (i=0; i<n; i++) dloc[i] += ainew[i];
5641     b->diag       = dloc;
5642     b->free_diag  = PETSC_TRUE;
5643     b->ilen       = 0;
5644     b->imax       = 0;
5645     b->row        = isrow;
5646     b->col        = iscol;
5647     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5648     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5649     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5650     b->icol       = isicol;
5651     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5652     /* In b structure:  Free imax, ilen, old a, old j.
5653        Allocate dloc, solve_work, new a, new j */
5654     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5655     b->maxnz          = b->nz = ainew[n];
5656 
5657     fact->info.factor_mallocs    = reallocate;
5658     fact->info.fill_ratio_given  = f;
5659     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5660 
5661   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5662   PetscFunctionReturn(0);
5663 }
5664 
5665 #undef __FUNCT__
5666 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5667 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5668 {
5669   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5670   /* int i,*AJ=a->j,nz=a->nz; */
5671   PetscFunctionBegin;
5672   /* Undo Column scaling */
5673 /*    while (nz--) { */
5674 /*      AJ[i] = AJ[i]/4; */
5675 /*    } */
5676   /* This should really invoke a push/pop logic, but we don't have that yet. */
5677   A->ops->setunfactored = PETSC_NULL;
5678   PetscFunctionReturn(0);
5679 }
5680 
5681 #undef __FUNCT__
5682 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5683 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5684 {
5685   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5686   PetscInt       *AJ=a->j,nz=a->nz;
5687   unsigned short *aj=(unsigned short *)AJ;
5688   PetscFunctionBegin;
5689   /* Is this really necessary? */
5690   while (nz--) {
5691     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5692   }
5693   A->ops->setunfactored = PETSC_NULL;
5694   PetscFunctionReturn(0);
5695 }
5696 
5697 
5698