xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 86ced885e2f032ee8a3c7c79d907273ce869a867)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124   PetscInt       *diag = a->diag,oidx;
125   MatScalar      *aa=a->a,*v;
126   PetscScalar    s1,s2,s3,x1,x2,x3;
127   PetscScalar    *x,*b;
128 
129   PetscFunctionBegin;
130   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133 
134   /* forward solve the U^T */
135   idx = 0;
136   for (i=0; i<n; i++) {
137 
138     v     = aa + 9*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144     v += 9;
145 
146     vi    = aj + diag[i] + 1;
147     nz    = ai[i+1] - diag[i] - 1;
148     while (nz--) {
149       oidx = 3*(*vi++);
150       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153       v  += 9;
154     }
155     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156     idx += 3;
157   }
158   /* backward solve the L^T */
159   for (i=n-1; i>=0; i--){
160     v    = aa + 9*diag[i] - 9;
161     vi   = aj + diag[i] - 1;
162     nz   = diag[i] - ai[i];
163     idt  = 3*i;
164     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165     while (nz--) {
166       idx   = 3*(*vi--);
167       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170       v -= 9;
171     }
172   }
173   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182 {
183   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184   PetscErrorCode ierr;
185   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186   PetscInt       *diag = a->diag,oidx;
187   MatScalar      *aa=a->a,*v;
188   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
189   PetscScalar    *x,*b;
190 
191   PetscFunctionBegin;
192   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
193   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195 
196   /* forward solve the U^T */
197   idx = 0;
198   for (i=0; i<n; i++) {
199 
200     v     = aa + 16*diag[i];
201     /* multiply by the inverse of the block diagonal */
202     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207     v += 16;
208 
209     vi    = aj + diag[i] + 1;
210     nz    = ai[i+1] - diag[i] - 1;
211     while (nz--) {
212       oidx = 4*(*vi++);
213       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217       v  += 16;
218     }
219     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220     idx += 4;
221   }
222   /* backward solve the L^T */
223   for (i=n-1; i>=0; i--){
224     v    = aa + 16*diag[i] - 16;
225     vi   = aj + diag[i] - 1;
226     nz   = diag[i] - ai[i];
227     idt  = 4*i;
228     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229     while (nz--) {
230       idx   = 4*(*vi--);
231       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235       v -= 16;
236     }
237   }
238   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
239   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247 {
248   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249   PetscErrorCode ierr;
250   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251   PetscInt       *diag = a->diag,oidx;
252   MatScalar      *aa=a->a,*v;
253   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
254   PetscScalar    *x,*b;
255 
256   PetscFunctionBegin;
257   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
258   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260 
261   /* forward solve the U^T */
262   idx = 0;
263   for (i=0; i<n; i++) {
264 
265     v     = aa + 25*diag[i];
266     /* multiply by the inverse of the block diagonal */
267     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273     v += 25;
274 
275     vi    = aj + diag[i] + 1;
276     nz    = ai[i+1] - diag[i] - 1;
277     while (nz--) {
278       oidx = 5*(*vi++);
279       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284       v  += 25;
285     }
286     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287     idx += 5;
288   }
289   /* backward solve the L^T */
290   for (i=n-1; i>=0; i--){
291     v    = aa + 25*diag[i] - 25;
292     vi   = aj + diag[i] - 1;
293     nz   = diag[i] - ai[i];
294     idt  = 5*i;
295     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296     while (nz--) {
297       idx   = 5*(*vi--);
298       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303       v -= 25;
304     }
305   }
306   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
307   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315 {
316   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317   PetscErrorCode ierr;
318   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319   PetscInt       *diag = a->diag,oidx;
320   MatScalar      *aa=a->a,*v;
321   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
322   PetscScalar    *x,*b;
323 
324   PetscFunctionBegin;
325   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
326   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
327   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328 
329   /* forward solve the U^T */
330   idx = 0;
331   for (i=0; i<n; i++) {
332 
333     v     = aa + 36*diag[i];
334     /* multiply by the inverse of the block diagonal */
335     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336     x6    = x[5+idx];
337     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343     v += 36;
344 
345     vi    = aj + diag[i] + 1;
346     nz    = ai[i+1] - diag[i] - 1;
347     while (nz--) {
348       oidx = 6*(*vi++);
349       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355       v  += 36;
356     }
357     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358     x[5+idx] = s6;
359     idx += 6;
360   }
361   /* backward solve the L^T */
362   for (i=n-1; i>=0; i--){
363     v    = aa + 36*diag[i] - 36;
364     vi   = aj + diag[i] - 1;
365     nz   = diag[i] - ai[i];
366     idt  = 6*i;
367     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368     s6 = x[5+idt];
369     while (nz--) {
370       idx   = 6*(*vi--);
371       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377       v -= 36;
378     }
379   }
380   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389 {
390   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391   PetscErrorCode ierr;
392   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393   PetscInt       *diag = a->diag,oidx;
394   MatScalar      *aa=a->a,*v;
395   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
396   PetscScalar    *x,*b;
397 
398   PetscFunctionBegin;
399   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
400   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
401   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402 
403   /* forward solve the U^T */
404   idx = 0;
405   for (i=0; i<n; i++) {
406 
407     v     = aa + 49*diag[i];
408     /* multiply by the inverse of the block diagonal */
409     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410     x6    = x[5+idx]; x7 = x[6+idx];
411     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418     v += 49;
419 
420     vi    = aj + diag[i] + 1;
421     nz    = ai[i+1] - diag[i] - 1;
422     while (nz--) {
423       oidx = 7*(*vi++);
424       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431       v  += 49;
432     }
433     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434     x[5+idx] = s6;x[6+idx] = s7;
435     idx += 7;
436   }
437   /* backward solve the L^T */
438   for (i=n-1; i>=0; i--){
439     v    = aa + 49*diag[i] - 49;
440     vi   = aj + diag[i] - 1;
441     nz   = diag[i] - ai[i];
442     idt  = 7*i;
443     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444     s6 = x[5+idt];s7 = x[6+idt];
445     while (nz--) {
446       idx   = 7*(*vi--);
447       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454       v -= 49;
455     }
456   }
457   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 /*---------------------------------------------------------------------------------------------*/
464 #undef __FUNCT__
465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469   IS             iscol=a->col,isrow=a->row;
470   PetscErrorCode ierr;
471   const PetscInt *r,*c,*rout,*cout;
472   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473   PetscInt       *diag = a->diag;
474   MatScalar      *aa=a->a,*v;
475   PetscScalar    s1,*x,*b,*t;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   t  = a->solve_work;
481 
482   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484 
485   /* copy the b into temp work space according to permutation */
486   for (i=0; i<n; i++) {
487     t[i] = b[c[i]];
488   }
489 
490   /* forward solve the U^T */
491   for (i=0; i<n; i++) {
492 
493     v     = aa + diag[i];
494     /* multiply by the inverse of the block diagonal */
495     s1    = (*v++)*t[i];
496     vi    = aj + diag[i] + 1;
497     nz    = ai[i+1] - diag[i] - 1;
498     while (nz--) {
499       t[*vi++]  -= (*v++)*s1;
500     }
501     t[i]   = s1;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--){
505     v    = aa + diag[i] - 1;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     s1   = t[i];
509     while (nz--) {
510       t[*vi--]   -=  (*v--)*s1;
511     }
512   }
513 
514   /* copy t into x according to permutation */
515   for (i=0; i<n; i++) {
516     x[r[i]]   = t[i];
517   }
518 
519   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
521   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524   PetscFunctionReturn(0);
525 }
526 
527 #undef __FUNCT__
528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530 {
531   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532   IS             iscol=a->col,isrow=a->row;
533   PetscErrorCode ierr;
534   const PetscInt *r,*c,*rout,*cout;
535   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537   MatScalar      *aa=a->a,*v;
538   PetscScalar    s1,s2,x1,x2;
539   PetscScalar    *x,*b,*t;
540 
541   PetscFunctionBegin;
542   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544   t  = a->solve_work;
545 
546   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548 
549   /* copy the b into temp work space according to permutation */
550   ii = 0;
551   for (i=0; i<n; i++) {
552     ic      = 2*c[i];
553     t[ii]   = b[ic];
554     t[ii+1] = b[ic+1];
555     ii += 2;
556   }
557 
558   /* forward solve the U^T */
559   idx = 0;
560   for (i=0; i<n; i++) {
561 
562     v     = aa + 4*diag[i];
563     /* multiply by the inverse of the block diagonal */
564     x1    = t[idx];   x2 = t[1+idx];
565     s1 = v[0]*x1  +  v[1]*x2;
566     s2 = v[2]*x1  +  v[3]*x2;
567     v += 4;
568 
569     vi    = aj + diag[i] + 1;
570     nz    = ai[i+1] - diag[i] - 1;
571     while (nz--) {
572       oidx = 2*(*vi++);
573       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575       v  += 4;
576     }
577     t[idx]   = s1;t[1+idx] = s2;
578     idx += 2;
579   }
580   /* backward solve the L^T */
581   for (i=n-1; i>=0; i--){
582     v    = aa + 4*diag[i] - 4;
583     vi   = aj + diag[i] - 1;
584     nz   = diag[i] - ai[i];
585     idt  = 2*i;
586     s1 = t[idt];  s2 = t[1+idt];
587     while (nz--) {
588       idx   = 2*(*vi--);
589       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591       v -= 4;
592     }
593   }
594 
595   /* copy t into x according to permutation */
596   ii = 0;
597   for (i=0; i<n; i++) {
598     ir      = 2*r[i];
599     x[ir]   = t[ii];
600     x[ir+1] = t[ii+1];
601     ii += 2;
602   }
603 
604   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
606   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615 {
616   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617   IS             iscol=a->col,isrow=a->row;
618   PetscErrorCode ierr;
619   const PetscInt *r,*c,*rout,*cout;
620   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622   MatScalar      *aa=a->a,*v;
623   PetscScalar    s1,s2,s3,x1,x2,x3;
624   PetscScalar    *x,*b,*t;
625 
626   PetscFunctionBegin;
627   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629   t  = a->solve_work;
630 
631   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633 
634   /* copy the b into temp work space according to permutation */
635   ii = 0;
636   for (i=0; i<n; i++) {
637     ic      = 3*c[i];
638     t[ii]   = b[ic];
639     t[ii+1] = b[ic+1];
640     t[ii+2] = b[ic+2];
641     ii += 3;
642   }
643 
644   /* forward solve the U^T */
645   idx = 0;
646   for (i=0; i<n; i++) {
647 
648     v     = aa + 9*diag[i];
649     /* multiply by the inverse of the block diagonal */
650     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654     v += 9;
655 
656     vi    = aj + diag[i] + 1;
657     nz    = ai[i+1] - diag[i] - 1;
658     while (nz--) {
659       oidx = 3*(*vi++);
660       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663       v  += 9;
664     }
665     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666     idx += 3;
667   }
668   /* backward solve the L^T */
669   for (i=n-1; i>=0; i--){
670     v    = aa + 9*diag[i] - 9;
671     vi   = aj + diag[i] - 1;
672     nz   = diag[i] - ai[i];
673     idt  = 3*i;
674     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675     while (nz--) {
676       idx   = 3*(*vi--);
677       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680       v -= 9;
681     }
682   }
683 
684   /* copy t into x according to permutation */
685   ii = 0;
686   for (i=0; i<n; i++) {
687     ir      = 3*r[i];
688     x[ir]   = t[ii];
689     x[ir+1] = t[ii+1];
690     x[ir+2] = t[ii+2];
691     ii += 3;
692   }
693 
694   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
696   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 #undef __FUNCT__
703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705 {
706   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707   IS             iscol=a->col,isrow=a->row;
708   PetscErrorCode ierr;
709   const PetscInt *r,*c,*rout,*cout;
710   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712   MatScalar      *aa=a->a,*v;
713   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
714   PetscScalar    *x,*b,*t;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   t  = a->solve_work;
720 
721   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723 
724   /* copy the b into temp work space according to permutation */
725   ii = 0;
726   for (i=0; i<n; i++) {
727     ic      = 4*c[i];
728     t[ii]   = b[ic];
729     t[ii+1] = b[ic+1];
730     t[ii+2] = b[ic+2];
731     t[ii+3] = b[ic+3];
732     ii += 4;
733   }
734 
735   /* forward solve the U^T */
736   idx = 0;
737   for (i=0; i<n; i++) {
738 
739     v     = aa + 16*diag[i];
740     /* multiply by the inverse of the block diagonal */
741     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746     v += 16;
747 
748     vi    = aj + diag[i] + 1;
749     nz    = ai[i+1] - diag[i] - 1;
750     while (nz--) {
751       oidx = 4*(*vi++);
752       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756       v  += 16;
757     }
758     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759     idx += 4;
760   }
761   /* backward solve the L^T */
762   for (i=n-1; i>=0; i--){
763     v    = aa + 16*diag[i] - 16;
764     vi   = aj + diag[i] - 1;
765     nz   = diag[i] - ai[i];
766     idt  = 4*i;
767     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768     while (nz--) {
769       idx   = 4*(*vi--);
770       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774       v -= 16;
775     }
776   }
777 
778   /* copy t into x according to permutation */
779   ii = 0;
780   for (i=0; i<n; i++) {
781     ir      = 4*r[i];
782     x[ir]   = t[ii];
783     x[ir+1] = t[ii+1];
784     x[ir+2] = t[ii+2];
785     x[ir+3] = t[ii+3];
786     ii += 4;
787   }
788 
789   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
791   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 #undef __FUNCT__
798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800 {
801   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802   IS             iscol=a->col,isrow=a->row;
803   PetscErrorCode ierr;
804   const PetscInt *r,*c,*rout,*cout;
805   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807   MatScalar      *aa=a->a,*v;
808   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
809   PetscScalar    *x,*b,*t;
810 
811   PetscFunctionBegin;
812   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   t  = a->solve_work;
815 
816   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818 
819   /* copy the b into temp work space according to permutation */
820   ii = 0;
821   for (i=0; i<n; i++) {
822     ic      = 5*c[i];
823     t[ii]   = b[ic];
824     t[ii+1] = b[ic+1];
825     t[ii+2] = b[ic+2];
826     t[ii+3] = b[ic+3];
827     t[ii+4] = b[ic+4];
828     ii += 5;
829   }
830 
831   /* forward solve the U^T */
832   idx = 0;
833   for (i=0; i<n; i++) {
834 
835     v     = aa + 25*diag[i];
836     /* multiply by the inverse of the block diagonal */
837     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843     v += 25;
844 
845     vi    = aj + diag[i] + 1;
846     nz    = ai[i+1] - diag[i] - 1;
847     while (nz--) {
848       oidx = 5*(*vi++);
849       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v  += 25;
855     }
856     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857     idx += 5;
858   }
859   /* backward solve the L^T */
860   for (i=n-1; i>=0; i--){
861     v    = aa + 25*diag[i] - 25;
862     vi   = aj + diag[i] - 1;
863     nz   = diag[i] - ai[i];
864     idt  = 5*i;
865     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866     while (nz--) {
867       idx   = 5*(*vi--);
868       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873       v -= 25;
874     }
875   }
876 
877   /* copy t into x according to permutation */
878   ii = 0;
879   for (i=0; i<n; i++) {
880     ir      = 5*r[i];
881     x[ir]   = t[ii];
882     x[ir+1] = t[ii+1];
883     x[ir+2] = t[ii+2];
884     x[ir+3] = t[ii+3];
885     x[ir+4] = t[ii+4];
886     ii += 5;
887   }
888 
889   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
891   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902   IS             iscol=a->col,isrow=a->row;
903   PetscErrorCode ierr;
904   const PetscInt *r,*c,*rout,*cout;
905   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907   MatScalar      *aa=a->a,*v;
908   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
909   PetscScalar    *x,*b,*t;
910 
911   PetscFunctionBegin;
912   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914   t  = a->solve_work;
915 
916   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918 
919   /* copy the b into temp work space according to permutation */
920   ii = 0;
921   for (i=0; i<n; i++) {
922     ic      = 6*c[i];
923     t[ii]   = b[ic];
924     t[ii+1] = b[ic+1];
925     t[ii+2] = b[ic+2];
926     t[ii+3] = b[ic+3];
927     t[ii+4] = b[ic+4];
928     t[ii+5] = b[ic+5];
929     ii += 6;
930   }
931 
932   /* forward solve the U^T */
933   idx = 0;
934   for (i=0; i<n; i++) {
935 
936     v     = aa + 36*diag[i];
937     /* multiply by the inverse of the block diagonal */
938     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939     x6    = t[5+idx];
940     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946     v += 36;
947 
948     vi    = aj + diag[i] + 1;
949     nz    = ai[i+1] - diag[i] - 1;
950     while (nz--) {
951       oidx = 6*(*vi++);
952       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958       v  += 36;
959     }
960     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961     t[5+idx] = s6;
962     idx += 6;
963   }
964   /* backward solve the L^T */
965   for (i=n-1; i>=0; i--){
966     v    = aa + 36*diag[i] - 36;
967     vi   = aj + diag[i] - 1;
968     nz   = diag[i] - ai[i];
969     idt  = 6*i;
970     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971     s6 = t[5+idt];
972     while (nz--) {
973       idx   = 6*(*vi--);
974       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980       v -= 36;
981     }
982   }
983 
984   /* copy t into x according to permutation */
985   ii = 0;
986   for (i=0; i<n; i++) {
987     ir      = 6*r[i];
988     x[ir]   = t[ii];
989     x[ir+1] = t[ii+1];
990     x[ir+2] = t[ii+2];
991     x[ir+3] = t[ii+3];
992     x[ir+4] = t[ii+4];
993     x[ir+5] = t[ii+5];
994     ii += 6;
995   }
996 
997   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
999   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008 {
1009   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010   IS             iscol=a->col,isrow=a->row;
1011   PetscErrorCode ierr;
1012   const PetscInt *r,*c,*rout,*cout;
1013   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015   MatScalar      *aa=a->a,*v;
1016   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1017   PetscScalar    *x,*b,*t;
1018 
1019   PetscFunctionBegin;
1020   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022   t  = a->solve_work;
1023 
1024   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026 
1027   /* copy the b into temp work space according to permutation */
1028   ii = 0;
1029   for (i=0; i<n; i++) {
1030     ic      = 7*c[i];
1031     t[ii]   = b[ic];
1032     t[ii+1] = b[ic+1];
1033     t[ii+2] = b[ic+2];
1034     t[ii+3] = b[ic+3];
1035     t[ii+4] = b[ic+4];
1036     t[ii+5] = b[ic+5];
1037     t[ii+6] = b[ic+6];
1038     ii += 7;
1039   }
1040 
1041   /* forward solve the U^T */
1042   idx = 0;
1043   for (i=0; i<n; i++) {
1044 
1045     v     = aa + 49*diag[i];
1046     /* multiply by the inverse of the block diagonal */
1047     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048     x6    = t[5+idx]; x7 = t[6+idx];
1049     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056     v += 49;
1057 
1058     vi    = aj + diag[i] + 1;
1059     nz    = ai[i+1] - diag[i] - 1;
1060     while (nz--) {
1061       oidx = 7*(*vi++);
1062       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v  += 49;
1070     }
1071     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072     t[5+idx] = s6;t[6+idx] = s7;
1073     idx += 7;
1074   }
1075   /* backward solve the L^T */
1076   for (i=n-1; i>=0; i--){
1077     v    = aa + 49*diag[i] - 49;
1078     vi   = aj + diag[i] - 1;
1079     nz   = diag[i] - ai[i];
1080     idt  = 7*i;
1081     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082     s6 = t[5+idt];s7 = t[6+idt];
1083     while (nz--) {
1084       idx   = 7*(*vi--);
1085       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092       v -= 49;
1093     }
1094   }
1095 
1096   /* copy t into x according to permutation */
1097   ii = 0;
1098   for (i=0; i<n; i++) {
1099     ir      = 7*r[i];
1100     x[ir]   = t[ii];
1101     x[ir+1] = t[ii+1];
1102     x[ir+2] = t[ii+2];
1103     x[ir+3] = t[ii+3];
1104     x[ir+4] = t[ii+4];
1105     x[ir+5] = t[ii+5];
1106     x[ir+6] = t[ii+6];
1107     ii += 7;
1108   }
1109 
1110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1112   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 /* ----------------------------------------------------------- */
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1124   IS             iscol=a->col,isrow=a->row;
1125   PetscErrorCode ierr;
1126   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1127   PetscInt       i,n=a->mbs;
1128   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1129   MatScalar      *aa=a->a,*v;
1130   PetscScalar    *x,*b,*s,*t,*ls;
1131 
1132   PetscFunctionBegin;
1133   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135   t  = a->solve_work;
1136 
1137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1139 
1140   /* forward solve the lower triangular */
1141   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1142   for (i=1; i<n; i++) {
1143     v   = aa + bs2*ai[i];
1144     vi  = aj + ai[i];
1145     nz  = a->diag[i] - ai[i];
1146     s = t + bs*i;
1147     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     while (nz--) {
1149       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1150       v += bs2;
1151     }
1152   }
1153   /* backward solve the upper triangular */
1154   ls = a->solve_work + A->cmap->n;
1155   for (i=n-1; i>=0; i--){
1156     v   = aa + bs2*(a->diag[i] + 1);
1157     vi  = aj + a->diag[i] + 1;
1158     nz  = ai[i+1] - a->diag[i] - 1;
1159     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1160     while (nz--) {
1161       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1162       v += bs2;
1163     }
1164     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1165     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1166   }
1167 
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 /* ----------------------------------------------------------- */
1177 #undef __FUNCT__
1178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
1179 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1180 {
1181   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1182   IS                iscol=a->col,isrow=a->row;
1183   PetscErrorCode    ierr;
1184   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1185   PetscInt          i,n=a->mbs,j;
1186   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
1187   const MatScalar   *aa=a->a,*v;
1188   PetscScalar       *x,*t,*ls;
1189   const PetscScalar *b;
1190   PetscFunctionBegin;
1191   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1192   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1193   t    = a->solve_work;
1194 
1195   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1196   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1197 
1198   /* copy the b into temp work space according to permutation */
1199   for (i=0; i<n; i++) {
1200     for (j=0; j<bs; j++) {
1201       t[i*bs+j] = b[c[i]*bs+j];
1202     }
1203   }
1204 
1205 
1206   /* forward solve the upper triangular transpose */
1207   ls = a->solve_work + A->cmap->n;
1208   for (i=0; i<n; i++){
1209     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1210     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1211     v   = aa + bs2*(a->diag[i] + 1);
1212     vi  = aj + a->diag[i] + 1;
1213     nz  = ai[i+1] - a->diag[i] - 1;
1214     while (nz--) {
1215       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1216       v += bs2;
1217     }
1218   }
1219 
1220   /* backward solve the lower triangular transpose */
1221   for (i=n-1; i>=0; i--) {
1222     v   = aa + bs2*ai[i];
1223     vi  = aj + ai[i];
1224     nz  = a->diag[i] - ai[i];
1225     while (nz--) {
1226       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1227       v += bs2;
1228     }
1229   }
1230 
1231   /* copy t into x according to permutation */
1232   for (i=0; i<n; i++) {
1233     for (j=0; j<bs; j++) {
1234       x[bs*r[i]+j]   = t[bs*i+j];
1235     }
1236   }
1237 
1238   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1239   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1240   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1241   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1242   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 #undef __FUNCT__
1247 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1248 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1249 {
1250   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1251   IS             iscol=a->col,isrow=a->row;
1252   PetscErrorCode ierr;
1253   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1254   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1255   MatScalar      *aa=a->a,*v;
1256   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1257   PetscScalar    *x,*b,*t;
1258 
1259   PetscFunctionBegin;
1260   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1261   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1262   t  = a->solve_work;
1263 
1264   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1265   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1266 
1267   /* forward solve the lower triangular */
1268   idx    = 7*(*r++);
1269   t[0] = b[idx];   t[1] = b[1+idx];
1270   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1271   t[5] = b[5+idx]; t[6] = b[6+idx];
1272 
1273   for (i=1; i<n; i++) {
1274     v     = aa + 49*ai[i];
1275     vi    = aj + ai[i];
1276     nz    = diag[i] - ai[i];
1277     idx   = 7*(*r++);
1278     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1279     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1280     while (nz--) {
1281       idx   = 7*(*vi++);
1282       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1283       x4    = t[3+idx];x5 = t[4+idx];
1284       x6    = t[5+idx];x7 = t[6+idx];
1285       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1286       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1287       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1288       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1289       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1290       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1291       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1292       v += 49;
1293     }
1294     idx = 7*i;
1295     t[idx]   = s1;t[1+idx] = s2;
1296     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1297     t[5+idx] = s6;t[6+idx] = s7;
1298   }
1299   /* backward solve the upper triangular */
1300   for (i=n-1; i>=0; i--){
1301     v    = aa + 49*diag[i] + 49;
1302     vi   = aj + diag[i] + 1;
1303     nz   = ai[i+1] - diag[i] - 1;
1304     idt  = 7*i;
1305     s1 = t[idt];  s2 = t[1+idt];
1306     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1307     s6 = t[5+idt];s7 = t[6+idt];
1308     while (nz--) {
1309       idx   = 7*(*vi++);
1310       x1    = t[idx];   x2 = t[1+idx];
1311       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1312       x6    = t[5+idx]; x7 = t[6+idx];
1313       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1314       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1315       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1316       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1317       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1318       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1319       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1320       v += 49;
1321     }
1322     idc = 7*(*c--);
1323     v   = aa + 49*diag[i];
1324     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1325                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1326     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1327                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1328     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1329                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1330     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1331                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1332     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1333                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1334     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1335                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1336     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1337                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1338   }
1339 
1340   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1341   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1342   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1343   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1344   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1345   PetscFunctionReturn(0);
1346 }
1347 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
/*
   MatSolve_SeqBAIJ_7_newdatastruct - Earlier variant of the block size 7
   solve; compiled only when OLD_ROUTINE_TO_BE_REPLACED is defined.

   Input:   A  - the factored matrix (its data is read as Mat_SeqBAIJ)
            bb - right-hand side
   Output:  xx - solution

   In this variant the row pointer a->i alone delimits both parts: entries
   ai[i]..ai[i+1] are used by the lower-triangular sweep for block row i, and
   entries ai[2*n-i]..ai[2*n-i+1] by the upper-triangular sweep, where the last
   entry of that range is used as the diagonal block.
   Returns 0 on success; errors from the PETSc calls are propagated by CHKERRQ.
*/
PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *fact = (Mat_SeqBAIJ *)A->data;
  IS             colis = fact->col,rowis = fact->row;
  PetscErrorCode ierr;
  const PetscInt *rowidx,*colidx,*bi = fact->i,*bj = fact->j,*col;
  PetscInt       k,left,urow,nblk = fact->mbs;
  MatScalar      *vals = fact->a,*m;
  PetscScalar    y1,y2,y3,y4,y5,y6,y7,u1,u2,u3,u4,u5,u6,u7;
  PetscScalar    *xa,*ba,*wk,*cur,*oth;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&ba);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&xa);CHKERRQ(ierr);
  wk   = fact->solve_work;

  ierr = ISGetIndices(rowis,&rowidx);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&colidx);CHKERRQ(ierr);

  /* lower-triangular sweep; block row 0 is a plain copy of its right-hand side */
  oth   = ba + 7*rowidx[0];
  wk[0] = oth[0]; wk[1] = oth[1]; wk[2] = oth[2]; wk[3] = oth[3];
  wk[4] = oth[4]; wk[5] = oth[5]; wk[6] = oth[6];

  for (k=1; k<nblk; k++) {
    m   = vals + 49*bi[k];
    col = bj + bi[k];
    oth = ba + 7*rowidx[k];
    y1  = oth[0]; y2 = oth[1]; y3 = oth[2]; y4 = oth[3];
    y5  = oth[4]; y6 = oth[5]; y7 = oth[6];
    for (left = bi[k+1] - bi[k]; left > 0; left--) {
      oth = wk + 7*(*col++);
      u1  = oth[0]; u2 = oth[1]; u3 = oth[2]; u4 = oth[3];
      u5  = oth[4]; u6 = oth[5]; u7 = oth[6];
      y1 -= m[0]*u1 + m[7]*u2  + m[14]*u3 + m[21]*u4 + m[28]*u5 + m[35]*u6 + m[42]*u7;
      y2 -= m[1]*u1 + m[8]*u2  + m[15]*u3 + m[22]*u4 + m[29]*u5 + m[36]*u6 + m[43]*u7;
      y3 -= m[2]*u1 + m[9]*u2  + m[16]*u3 + m[23]*u4 + m[30]*u5 + m[37]*u6 + m[44]*u7;
      y4 -= m[3]*u1 + m[10]*u2 + m[17]*u3 + m[24]*u4 + m[31]*u5 + m[38]*u6 + m[45]*u7;
      y5 -= m[4]*u1 + m[11]*u2 + m[18]*u3 + m[25]*u4 + m[32]*u5 + m[39]*u6 + m[46]*u7;
      y6 -= m[5]*u1 + m[12]*u2 + m[19]*u3 + m[26]*u4 + m[33]*u5 + m[40]*u6 + m[47]*u7;
      y7 -= m[6]*u1 + m[13]*u2 + m[20]*u3 + m[27]*u4 + m[34]*u5 + m[41]*u6 + m[48]*u7;
      m  += 49;
    }
    cur    = wk + 7*k;
    cur[0] = y1; cur[1] = y2; cur[2] = y3; cur[3] = y4;
    cur[4] = y5; cur[5] = y6; cur[6] = y7;
  }

  /* upper-triangular sweep, last block row to first */
  for (k=nblk-1; k>=0; k--) {
    urow = 2*nblk - k;
    m    = vals + 49*bi[urow];
    col  = bj + bi[urow];
    cur  = wk + 7*k;
    y1   = cur[0]; y2 = cur[1]; y3 = cur[2]; y4 = cur[3];
    y5   = cur[4]; y6 = cur[5]; y7 = cur[6];
    /* every entry of the range except the last one is an off-diagonal block */
    for (left = bi[urow+1] - bi[urow] - 1; left > 0; left--) {
      oth = wk + 7*(*col++);
      u1  = oth[0]; u2 = oth[1]; u3 = oth[2]; u4 = oth[3];
      u5  = oth[4]; u6 = oth[5]; u7 = oth[6];
      y1 -= m[0]*u1 + m[7]*u2  + m[14]*u3 + m[21]*u4 + m[28]*u5 + m[35]*u6 + m[42]*u7;
      y2 -= m[1]*u1 + m[8]*u2  + m[15]*u3 + m[22]*u4 + m[29]*u5 + m[36]*u6 + m[43]*u7;
      y3 -= m[2]*u1 + m[9]*u2  + m[16]*u3 + m[23]*u4 + m[30]*u5 + m[37]*u6 + m[44]*u7;
      y4 -= m[3]*u1 + m[10]*u2 + m[17]*u3 + m[24]*u4 + m[31]*u5 + m[38]*u6 + m[45]*u7;
      y5 -= m[4]*u1 + m[11]*u2 + m[18]*u3 + m[25]*u4 + m[32]*u5 + m[39]*u6 + m[46]*u7;
      y6 -= m[5]*u1 + m[12]*u2 + m[19]*u3 + m[26]*u4 + m[33]*u5 + m[40]*u6 + m[47]*u7;
      y7 -= m[6]*u1 + m[13]*u2 + m[20]*u3 + m[27]*u4 + m[34]*u5 + m[41]*u6 + m[48]*u7;
      m  += 49;
    }
    /* m now points at the block following the off-diagonal ones; it is used as the diagonal block */
    oth    = xa + 7*colidx[k];
    oth[0] = cur[0] = m[0]*y1+m[7]*y2+m[14]*y3+
                      m[21]*y4+m[28]*y5+m[35]*y6+m[42]*y7;
    oth[1] = cur[1] = m[1]*y1+m[8]*y2+m[15]*y3+
                      m[22]*y4+m[29]*y5+m[36]*y6+m[43]*y7;
    oth[2] = cur[2] = m[2]*y1+m[9]*y2+m[16]*y3+
                      m[23]*y4+m[30]*y5+m[37]*y6+m[44]*y7;
    oth[3] = cur[3] = m[3]*y1+m[10]*y2+m[17]*y3+
                      m[24]*y4+m[31]*y5+m[38]*y6+m[45]*y7;
    oth[4] = cur[4] = m[4]*y1+m[11]*y2+m[18]*y3+
                      m[25]*y4+m[32]*y5+m[39]*y6+m[46]*y7;
    oth[5] = cur[5] = m[5]*y1+m[12]*y2+m[19]*y3+
                      m[26]*y4+m[33]*y5+m[40]*y6+m[47]*y7;
    oth[6] = cur[6] = m[6]*y1+m[13]*y2+m[20]*y3+
                      m[27]*y4+m[34]*y5+m[41]*y6+m[48]*y7;
  }

  ierr = ISRestoreIndices(rowis,&rowidx);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&colidx);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&ba);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&xa);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(fact->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
1451 
1452 #undef __FUNCT__
1453 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1454 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1455 {
1456   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1457   IS             iscol=a->col,isrow=a->row;
1458   PetscErrorCode ierr;
1459   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
1460   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
1461   MatScalar      *aa=a->a,*v;
1462   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1463   PetscScalar    *x,*b,*t;
1464 
1465   PetscFunctionBegin;
1466   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1467   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1468   t  = a->solve_work;
1469 
1470   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1471   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1472 
1473   /* forward solve the lower triangular */
1474   idx    = 7*r[0];
1475   t[0] = b[idx];   t[1] = b[1+idx];
1476   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1477   t[5] = b[5+idx]; t[6] = b[6+idx];
1478 
1479   for (i=1; i<n; i++) {
1480     v     = aa + 49*ai[i];
1481     vi    = aj + ai[i];
1482     nz    = ai[i+1] - ai[i];
1483     idx   = 7*r[i];
1484     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1485     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1486     for(m=0;m<nz;m++){
1487       idx   = 7*vi[m];
1488       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1489       x4    = t[3+idx];x5 = t[4+idx];
1490       x6    = t[5+idx];x7 = t[6+idx];
1491       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1492       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1493       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1494       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1495       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1496       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1497       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1498       v += 49;
1499     }
1500     idx = 7*i;
1501     t[idx]   = s1;t[1+idx] = s2;
1502     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1503     t[5+idx] = s6;t[6+idx] = s7;
1504   }
1505   /* backward solve the upper triangular */
1506   for (i=n-1; i>=0; i--){
1507     v    = aa + 49*(adiag[i+1]+1);
1508     vi   = aj + adiag[i+1]+1;
1509     nz   = adiag[i] - adiag[i+1] - 1;
1510     idt  = 7*i;
1511     s1 = t[idt];  s2 = t[1+idt];
1512     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1513     s6 = t[5+idt];s7 = t[6+idt];
1514     for(m=0;m<nz;m++){
1515       idx   = 7*vi[m];
1516       x1    = t[idx];   x2 = t[1+idx];
1517       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1518       x6    = t[5+idx]; x7 = t[6+idx];
1519       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1520       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1521       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1522       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1523       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1524       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1525       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1526       v += 49;
1527     }
1528     idc = 7*c[i];
1529     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1530                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1531     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1532                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1533     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1534                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1535     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1536                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1537     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1538                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1539     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1540                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1541     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1542                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1543   }
1544 
1545   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1546   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1547   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1548   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1549   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 #undef __FUNCT__
1554 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1555 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1556 {
1557   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1558   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1559   PetscErrorCode    ierr;
1560   PetscInt          *diag = a->diag,jdx;
1561   const MatScalar   *aa=a->a,*v;
1562   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1563   const PetscScalar *b;
1564 
1565   PetscFunctionBegin;
1566   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1567   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1568   /* forward solve the lower triangular */
1569   idx    = 0;
1570   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1571   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1572   x[6] = b[6+idx];
1573   for (i=1; i<n; i++) {
1574     v     =  aa + 49*ai[i];
1575     vi    =  aj + ai[i];
1576     nz    =  diag[i] - ai[i];
1577     idx   =  7*i;
1578     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1579     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1580     s7  =  b[6+idx];
1581     while (nz--) {
1582       jdx   = 7*(*vi++);
1583       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1584       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1585       x7    = x[6+jdx];
1586       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1587       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1588       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1589       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1590       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1591       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1592       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1593       v += 49;
1594      }
1595     x[idx]   = s1;
1596     x[1+idx] = s2;
1597     x[2+idx] = s3;
1598     x[3+idx] = s4;
1599     x[4+idx] = s5;
1600     x[5+idx] = s6;
1601     x[6+idx] = s7;
1602   }
1603   /* backward solve the upper triangular */
1604   for (i=n-1; i>=0; i--){
1605     v    = aa + 49*diag[i] + 49;
1606     vi   = aj + diag[i] + 1;
1607     nz   = ai[i+1] - diag[i] - 1;
1608     idt  = 7*i;
1609     s1 = x[idt];   s2 = x[1+idt];
1610     s3 = x[2+idt]; s4 = x[3+idt];
1611     s5 = x[4+idt]; s6 = x[5+idt];
1612     s7 = x[6+idt];
1613     while (nz--) {
1614       idx   = 7*(*vi++);
1615       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1616       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1617       x7    = x[6+idx];
1618       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1619       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1620       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1621       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1622       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1623       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1624       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1625       v += 49;
1626     }
1627     v        = aa + 49*diag[i];
1628     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1629                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1630     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1631                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1632     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1633                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1634     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1635                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1636     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1637                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1638     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1639                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1640     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1641                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1642   }
1643 
1644   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1645   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1646   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1647   PetscFunctionReturn(0);
1648 }
1649 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct - older variant, compiled only
   when OLD_ROUTINE_TO_BE_REPLACED is defined.

   Runs a forward sweep followed by a backward sweep over 7x7 blocks, reading the
   right-hand side from bb and leaving the result in xx.  No index permutation is
   applied.

   Forward sweep:  block row i uses blocks a->i[i] .. a->i[i+1]-1.
   Backward sweep: block row i uses the blocks starting at a->i[2*n-i]; the last
   block of that range is not subtracted but multiplied into the row result
   (it is treated as the inverse of the diagonal block).

   A block entry is addressed as blk[r + 7*c] for row r and column c.
*/
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          nblk = baij->mbs,bs = A->rmap->bs,bs2 = baij->bs2;
  PetscInt          *ai = baij->i,*aj = baij->j,*cols;
  PetscInt          row,urow,cnt;
  const MatScalar   *vals = baij->a,*blk;
  const PetscScalar *b,*src;
  PetscScalar       *x,*dst;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward sweep: block row 0 is a plain copy of the right-hand side */
  x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
  x[4] = b[4]; x[5] = b[5]; x[6] = b[6];
  for (row=1; row<nblk; row++) {
    blk  = vals + bs2*ai[row];
    cols = aj + ai[row];
    cnt  = ai[row+1] - ai[row];
    src  = b + bs*row;
    s1 = src[0]; s2 = src[1]; s3 = src[2]; s4 = src[3];
    s5 = src[4]; s6 = src[5]; s7 = src[6];
    while (cnt-- > 0) {
      /* already-computed solution block selected by the column index */
      src = x + bs*(*cols++);
      x1 = src[0]; x2 = src[1]; x3 = src[2]; x4 = src[3];
      x5 = src[4]; x6 = src[5]; x7 = src[6];
      s1 -= blk[0]*x1 + blk[7]*x2  + blk[14]*x3 + blk[21]*x4 + blk[28]*x5 + blk[35]*x6 + blk[42]*x7;
      s2 -= blk[1]*x1 + blk[8]*x2  + blk[15]*x3 + blk[22]*x4 + blk[29]*x5 + blk[36]*x6 + blk[43]*x7;
      s3 -= blk[2]*x1 + blk[9]*x2  + blk[16]*x3 + blk[23]*x4 + blk[30]*x5 + blk[37]*x6 + blk[44]*x7;
      s4 -= blk[3]*x1 + blk[10]*x2 + blk[17]*x3 + blk[24]*x4 + blk[31]*x5 + blk[38]*x6 + blk[45]*x7;
      s5 -= blk[4]*x1 + blk[11]*x2 + blk[18]*x3 + blk[25]*x4 + blk[32]*x5 + blk[39]*x6 + blk[46]*x7;
      s6 -= blk[5]*x1 + blk[12]*x2 + blk[19]*x3 + blk[26]*x4 + blk[33]*x5 + blk[40]*x6 + blk[47]*x7;
      s7 -= blk[6]*x1 + blk[13]*x2 + blk[20]*x3 + blk[27]*x4 + blk[34]*x5 + blk[41]*x6 + blk[48]*x7;
      blk += bs2;
    }
    dst = x + bs*row;
    dst[0] = s1; dst[1] = s2; dst[2] = s3; dst[3] = s4;
    dst[4] = s5; dst[5] = s6; dst[6] = s7;
  }

  /* backward sweep */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;               /* position of this row's upper part in ai[] */
    blk  = vals + bs2*ai[urow];
    cols = aj + ai[urow];
    cnt  = ai[urow+1] - ai[urow] - 1;  /* the last block of the range is handled below */
    dst  = x + bs*row;
    s1 = dst[0]; s2 = dst[1]; s3 = dst[2]; s4 = dst[3];
    s5 = dst[4]; s6 = dst[5]; s7 = dst[6];
    while (cnt-- > 0) {
      src = x + bs*(*cols++);
      x1 = src[0]; x2 = src[1]; x3 = src[2]; x4 = src[3];
      x5 = src[4]; x6 = src[5]; x7 = src[6];
      s1 -= blk[0]*x1 + blk[7]*x2  + blk[14]*x3 + blk[21]*x4 + blk[28]*x5 + blk[35]*x6 + blk[42]*x7;
      s2 -= blk[1]*x1 + blk[8]*x2  + blk[15]*x3 + blk[22]*x4 + blk[29]*x5 + blk[36]*x6 + blk[43]*x7;
      s3 -= blk[2]*x1 + blk[9]*x2  + blk[16]*x3 + blk[23]*x4 + blk[30]*x5 + blk[37]*x6 + blk[44]*x7;
      s4 -= blk[3]*x1 + blk[10]*x2 + blk[17]*x3 + blk[24]*x4 + blk[31]*x5 + blk[38]*x6 + blk[45]*x7;
      s5 -= blk[4]*x1 + blk[11]*x2 + blk[18]*x3 + blk[25]*x4 + blk[32]*x5 + blk[39]*x6 + blk[46]*x7;
      s6 -= blk[5]*x1 + blk[12]*x2 + blk[19]*x3 + blk[26]*x4 + blk[33]*x5 + blk[40]*x6 + blk[47]*x7;
      s7 -= blk[6]*x1 + blk[13]*x2 + blk[20]*x3 + blk[27]*x4 + blk[34]*x5 + blk[41]*x6 + blk[48]*x7;
      blk += bs2;
    }
    /* blk is left on the block that follows the ones subtracted: x = inv_diagonal*x */
    dst[0] = blk[0]*s1 + blk[7]*s2  + blk[14]*s3 + blk[21]*s4 + blk[28]*s5 + blk[35]*s6 + blk[42]*s7;
    dst[1] = blk[1]*s1 + blk[8]*s2  + blk[15]*s3 + blk[22]*s4 + blk[29]*s5 + blk[36]*s6 + blk[43]*s7;
    dst[2] = blk[2]*s1 + blk[9]*s2  + blk[16]*s3 + blk[23]*s4 + blk[30]*s5 + blk[37]*s6 + blk[44]*s7;
    dst[3] = blk[3]*s1 + blk[10]*s2 + blk[17]*s3 + blk[24]*s4 + blk[31]*s5 + blk[38]*s6 + blk[45]*s7;
    dst[4] = blk[4]*s1 + blk[11]*s2 + blk[18]*s3 + blk[25]*s4 + blk[32]*s5 + blk[39]*s6 + blk[46]*s7;
    dst[5] = blk[5]*s1 + blk[12]*s2 + blk[19]*s3 + blk[26]*s4 + blk[33]*s5 + blk[40]*s6 + blk[47]*s7;
    dst[6] = blk[6]*s1 + blk[13]*s2 + blk[20]*s3 + blk[27]*s4 + blk[34]*s5 + blk[41]*s6 + blk[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(baij->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
1739 
1740 #undef __FUNCT__
1741 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1742 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1743 {
1744     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1745     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
1746     PetscErrorCode    ierr;
1747     PetscInt          idx,jdx,idt;
1748     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1749     const MatScalar   *aa=a->a,*v;
1750     PetscScalar       *x;
1751     const PetscScalar *b;
1752     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1753 
1754     PetscFunctionBegin;
1755     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1756     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1757     /* forward solve the lower triangular */
1758     idx    = 0;
1759     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1760     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1761     for (i=1; i<n; i++) {
1762        v    = aa + bs2*ai[i];
1763        vi   = aj + ai[i];
1764        nz   = ai[i+1] - ai[i];
1765       idx   = bs*i;
1766        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1767        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1768        for(k=0;k<nz;k++) {
1769           jdx   = bs*vi[k];
1770           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1771 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1772           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1773           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1774           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1775 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1776           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1777 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1778 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1779           v   +=  bs2;
1780         }
1781 
1782        x[idx]   = s1;
1783        x[1+idx] = s2;
1784        x[2+idx] = s3;
1785        x[3+idx] = s4;
1786        x[4+idx] = s5;
1787        x[5+idx] = s6;
1788        x[6+idx] = s7;
1789     }
1790 
1791    /* backward solve the upper triangular */
1792   for (i=n-1; i>=0; i--){
1793     v   = aa + bs2*(adiag[i+1]+1);
1794      vi  = aj + adiag[i+1]+1;
1795      nz  = adiag[i] - adiag[i+1]-1;
1796      idt = bs*i;
1797      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1798      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1799     for(k=0;k<nz;k++) {
1800       idx   = bs*vi[k];
1801        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1802        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1803        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1804        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1805        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1806        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1807        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1808        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1809        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1810         v   +=  bs2;
1811     }
1812     /* x = inv_diagonal*x */
1813     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1814     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1815     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1816     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1817     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1818     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1819     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1820   }
1821 
1822   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1823   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1824   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1825   PetscFunctionReturn(0);
1826 }
1827 
1828 #undef __FUNCT__
1829 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1830 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1831 {
1832   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1833   IS                iscol=a->col,isrow=a->row;
1834   PetscErrorCode    ierr;
1835   const PetscInt    *r,*c,*rout,*cout;
1836   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1837   const MatScalar   *aa=a->a,*v;
1838   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1839   const PetscScalar *b;
1840   PetscFunctionBegin;
1841   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1842   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1843   t  = a->solve_work;
1844 
1845   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1846   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1847 
1848   /* forward solve the lower triangular */
1849   idx    = 6*(*r++);
1850   t[0] = b[idx];   t[1] = b[1+idx];
1851   t[2] = b[2+idx]; t[3] = b[3+idx];
1852   t[4] = b[4+idx]; t[5] = b[5+idx];
1853   for (i=1; i<n; i++) {
1854     v     = aa + 36*ai[i];
1855     vi    = aj + ai[i];
1856     nz    = diag[i] - ai[i];
1857     idx   = 6*(*r++);
1858     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1859     s5  = b[4+idx]; s6 = b[5+idx];
1860     while (nz--) {
1861       idx   = 6*(*vi++);
1862       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1863       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1864       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1865       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1866       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1867       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1868       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1869       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1870       v += 36;
1871     }
1872     idx = 6*i;
1873     t[idx]   = s1;t[1+idx] = s2;
1874     t[2+idx] = s3;t[3+idx] = s4;
1875     t[4+idx] = s5;t[5+idx] = s6;
1876   }
1877   /* backward solve the upper triangular */
1878   for (i=n-1; i>=0; i--){
1879     v    = aa + 36*diag[i] + 36;
1880     vi   = aj + diag[i] + 1;
1881     nz   = ai[i+1] - diag[i] - 1;
1882     idt  = 6*i;
1883     s1 = t[idt];  s2 = t[1+idt];
1884     s3 = t[2+idt];s4 = t[3+idt];
1885     s5 = t[4+idt];s6 = t[5+idt];
1886     while (nz--) {
1887       idx   = 6*(*vi++);
1888       x1    = t[idx];   x2 = t[1+idx];
1889       x3    = t[2+idx]; x4 = t[3+idx];
1890       x5    = t[4+idx]; x6 = t[5+idx];
1891       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1892       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1893       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1894       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1895       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1896       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1897       v += 36;
1898     }
1899     idc = 6*(*c--);
1900     v   = aa + 36*diag[i];
1901     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1902                                  v[18]*s4+v[24]*s5+v[30]*s6;
1903     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1904                                  v[19]*s4+v[25]*s5+v[31]*s6;
1905     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1906                                  v[20]*s4+v[26]*s5+v[32]*s6;
1907     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1908                                  v[21]*s4+v[27]*s5+v[33]*s6;
1909     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1910                                  v[22]*s4+v[28]*s5+v[34]*s6;
1911     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1912                                  v[23]*s4+v[29]*s5+v[35]*s6;
1913   }
1914 
1915   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1916   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1917   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1918   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1919   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1920   PetscFunctionReturn(0);
1921 }
1922 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
/*
   MatSolve_SeqBAIJ_6_newdatastruct - older variant, compiled only when
   OLD_ROUTINE_TO_BE_REPLACED is defined.

   Forward and backward sweeps over 6x6 blocks with row and column index sets
   applied: block i of bb is fetched from 6*row_indices[i] and block i of the
   result is stored in xx at 6*col_indices[i].  The sweeps operate on the work
   array a->solve_work.

   Forward sweep:  block row i uses blocks a->i[i] .. a->i[i+1]-1.
   Backward sweep: block row i uses the blocks starting at a->i[2*n-i]; the last
   block of that range is not subtracted but multiplied into the row result.

   A block entry is addressed as blk[r + 6*c] for row r and column c.
*/
PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
  IS                isrow = baij->row,iscol = baij->col;
  PetscErrorCode    ierr;
  const PetscInt    *rout,*cout;
  PetscInt          nblk = baij->mbs,*ai = baij->i,*aj = baij->j;
  PetscInt          row,urow,cnt,*cols;
  const MatScalar   *vals = baij->a,*blk;
  const PetscScalar *b,*src;
  PetscScalar       *x,*t,*trow,*xout;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = baij->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr);

  /* forward sweep: block row 0 is the permuted right-hand side block */
  src  = b + 6*rout[0];
  t[0] = src[0]; t[1] = src[1];
  t[2] = src[2]; t[3] = src[3];
  t[4] = src[4]; t[5] = src[5];
  for (row=1; row<nblk; row++) {
    blk  = vals + 36*ai[row];
    cols = aj + ai[row];
    cnt  = ai[row+1] - ai[row];
    src  = b + 6*rout[row];
    s1 = src[0]; s2 = src[1]; s3 = src[2];
    s4 = src[3]; s5 = src[4]; s6 = src[5];
    while (cnt-- > 0) {
      src = t + 6*(*cols++);
      x1 = src[0]; x2 = src[1]; x3 = src[2];
      x4 = src[3]; x5 = src[4]; x6 = src[5];
      s1 -= blk[0]*x1 + blk[6]*x2  + blk[12]*x3 + blk[18]*x4 + blk[24]*x5 + blk[30]*x6;
      s2 -= blk[1]*x1 + blk[7]*x2  + blk[13]*x3 + blk[19]*x4 + blk[25]*x5 + blk[31]*x6;
      s3 -= blk[2]*x1 + blk[8]*x2  + blk[14]*x3 + blk[20]*x4 + blk[26]*x5 + blk[32]*x6;
      s4 -= blk[3]*x1 + blk[9]*x2  + blk[15]*x3 + blk[21]*x4 + blk[27]*x5 + blk[33]*x6;
      s5 -= blk[4]*x1 + blk[10]*x2 + blk[16]*x3 + blk[22]*x4 + blk[28]*x5 + blk[34]*x6;
      s6 -= blk[5]*x1 + blk[11]*x2 + blk[17]*x3 + blk[23]*x4 + blk[29]*x5 + blk[35]*x6;
      blk += 36;
    }
    trow = t + 6*row;
    trow[0] = s1; trow[1] = s2;
    trow[2] = s3; trow[3] = s4;
    trow[4] = s5; trow[5] = s6;
  }

  /* backward sweep */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;               /* position of this row's upper part in ai[] */
    blk  = vals + 36*ai[urow];
    cols = aj + ai[urow];
    cnt  = ai[urow+1] - ai[urow] - 1;  /* the last block of the range is handled below */
    trow = t + 6*row;
    s1 = trow[0]; s2 = trow[1];
    s3 = trow[2]; s4 = trow[3];
    s5 = trow[4]; s6 = trow[5];
    while (cnt-- > 0) {
      src = t + 6*(*cols++);
      x1 = src[0]; x2 = src[1];
      x3 = src[2]; x4 = src[3];
      x5 = src[4]; x6 = src[5];
      s1 -= blk[0]*x1 + blk[6]*x2  + blk[12]*x3 + blk[18]*x4 + blk[24]*x5 + blk[30]*x6;
      s2 -= blk[1]*x1 + blk[7]*x2  + blk[13]*x3 + blk[19]*x4 + blk[25]*x5 + blk[31]*x6;
      s3 -= blk[2]*x1 + blk[8]*x2  + blk[14]*x3 + blk[20]*x4 + blk[26]*x5 + blk[32]*x6;
      s4 -= blk[3]*x1 + blk[9]*x2  + blk[15]*x3 + blk[21]*x4 + blk[27]*x5 + blk[33]*x6;
      s5 -= blk[4]*x1 + blk[10]*x2 + blk[16]*x3 + blk[22]*x4 + blk[28]*x5 + blk[34]*x6;
      s6 -= blk[5]*x1 + blk[11]*x2 + blk[17]*x3 + blk[23]*x4 + blk[29]*x5 + blk[35]*x6;
      blk += 36;
    }
    /* blk is left on the block after the ones subtracted; keep the value in t and scatter it to xx */
    xout = x + 6*cout[row];
    xout[0] = trow[0] = blk[0]*s1 + blk[6]*s2  + blk[12]*s3 + blk[18]*s4 + blk[24]*s5 + blk[30]*s6;
    xout[1] = trow[1] = blk[1]*s1 + blk[7]*s2  + blk[13]*s3 + blk[19]*s4 + blk[25]*s5 + blk[31]*s6;
    xout[2] = trow[2] = blk[2]*s1 + blk[8]*s2  + blk[14]*s3 + blk[20]*s4 + blk[26]*s5 + blk[32]*s6;
    xout[3] = trow[3] = blk[3]*s1 + blk[9]*s2  + blk[15]*s3 + blk[21]*s4 + blk[27]*s5 + blk[33]*s6;
    xout[4] = trow[4] = blk[4]*s1 + blk[10]*s2 + blk[16]*s3 + blk[22]*s4 + blk[28]*s5 + blk[34]*s6;
    xout[5] = trow[5] = blk[5]*s1 + blk[11]*s2 + blk[17]*s3 + blk[23]*s4 + blk[29]*s5 + blk[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(baij->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
2019 
2020 #undef __FUNCT__
2021 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
2022 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
2023 {
2024   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2025   IS                iscol=a->col,isrow=a->row;
2026   PetscErrorCode    ierr;
2027   const PetscInt    *r,*c,*rout,*cout;
2028   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2029   const MatScalar   *aa=a->a,*v;
2030   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2031   const PetscScalar *b;
2032   PetscFunctionBegin;
2033   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2034   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2035   t  = a->solve_work;
2036 
2037   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2038   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2039 
2040   /* forward solve the lower triangular */
2041   idx    = 6*r[0];
2042   t[0] = b[idx];   t[1] = b[1+idx];
2043   t[2] = b[2+idx]; t[3] = b[3+idx];
2044   t[4] = b[4+idx]; t[5] = b[5+idx];
2045   for (i=1; i<n; i++) {
2046     v     = aa + 36*ai[i];
2047     vi    = aj + ai[i];
2048     nz    = ai[i+1] - ai[i];
2049     idx   = 6*r[i];
2050     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2051     s5  = b[4+idx]; s6 = b[5+idx];
2052     for(m=0;m<nz;m++){
2053       idx   = 6*vi[m];
2054       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2055       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2056       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2057       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2058       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2059       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2060       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2061       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2062       v += 36;
2063     }
2064     idx = 6*i;
2065     t[idx]   = s1;t[1+idx] = s2;
2066     t[2+idx] = s3;t[3+idx] = s4;
2067     t[4+idx] = s5;t[5+idx] = s6;
2068   }
2069   /* backward solve the upper triangular */
2070   for (i=n-1; i>=0; i--){
2071     v    = aa + 36*(adiag[i+1]+1);
2072     vi   = aj + adiag[i+1]+1;
2073     nz   = adiag[i] - adiag[i+1] - 1;
2074     idt  = 6*i;
2075     s1 = t[idt];  s2 = t[1+idt];
2076     s3 = t[2+idt];s4 = t[3+idt];
2077     s5 = t[4+idt];s6 = t[5+idt];
2078     for(m=0;m<nz;m++){
2079       idx   = 6*vi[m];
2080       x1    = t[idx];   x2 = t[1+idx];
2081       x3    = t[2+idx]; x4 = t[3+idx];
2082       x5    = t[4+idx]; x6 = t[5+idx];
2083       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2084       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2085       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2086       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2087       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2088       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2089       v += 36;
2090     }
2091     idc = 6*c[i];
2092     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2093                                  v[18]*s4+v[24]*s5+v[30]*s6;
2094     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2095                                  v[19]*s4+v[25]*s5+v[31]*s6;
2096     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2097                                  v[20]*s4+v[26]*s5+v[32]*s6;
2098     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2099                                  v[21]*s4+v[27]*s5+v[33]*s6;
2100     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2101                                  v[22]*s4+v[28]*s5+v[34]*s6;
2102     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2103                                  v[23]*s4+v[29]*s5+v[35]*s6;
2104   }
2105 
2106   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2107   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2108   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2109   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2110   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2111   PetscFunctionReturn(0);
2112 }
2113 
2114 #undef __FUNCT__
2115 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
2116 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
2117 {
2118   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2119   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2120   PetscErrorCode    ierr;
2121   PetscInt          *diag = a->diag,jdx;
2122   const MatScalar   *aa=a->a,*v;
2123   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2124   const PetscScalar *b;
2125 
2126   PetscFunctionBegin;
2127   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2128   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2129   /* forward solve the lower triangular */
2130   idx    = 0;
2131   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2132   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2133   for (i=1; i<n; i++) {
2134     v     =  aa + 36*ai[i];
2135     vi    =  aj + ai[i];
2136     nz    =  diag[i] - ai[i];
2137     idx   =  6*i;
2138     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2139     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2140     while (nz--) {
2141       jdx   = 6*(*vi++);
2142       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2143       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2144       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2145       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2146       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2147       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2148       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2149       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2150       v += 36;
2151      }
2152     x[idx]   = s1;
2153     x[1+idx] = s2;
2154     x[2+idx] = s3;
2155     x[3+idx] = s4;
2156     x[4+idx] = s5;
2157     x[5+idx] = s6;
2158   }
2159   /* backward solve the upper triangular */
2160   for (i=n-1; i>=0; i--){
2161     v    = aa + 36*diag[i] + 36;
2162     vi   = aj + diag[i] + 1;
2163     nz   = ai[i+1] - diag[i] - 1;
2164     idt  = 6*i;
2165     s1 = x[idt];   s2 = x[1+idt];
2166     s3 = x[2+idt]; s4 = x[3+idt];
2167     s5 = x[4+idt]; s6 = x[5+idt];
2168     while (nz--) {
2169       idx   = 6*(*vi++);
2170       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2171       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2172       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2173       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2174       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2175       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2176       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2177       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2178       v += 36;
2179     }
2180     v        = aa + 36*diag[i];
2181     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2182     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2183     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2184     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2185     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2186     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2187   }
2188 
2189   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2190   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2191   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2192   PetscFunctionReturn(0);
2193 }
2194 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct - older variant, compiled only
   when OLD_ROUTINE_TO_BE_REPLACED is defined.

   Runs a forward sweep followed by a backward sweep over 6x6 blocks, reading the
   right-hand side from bb and leaving the result in xx.  No index permutation is
   applied.

   Forward sweep:  block row i uses blocks a->i[i] .. a->i[i+1]-1.
   Backward sweep: block row i uses the blocks starting at a->i[2*n-i]; the last
   block of that range is not subtracted but multiplied into the row result
   (it is treated as the inverse of the diagonal block).

   A block entry is addressed as blk[r + 6*c] for row r and column c.
*/
PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          nblk = baij->mbs,bs = A->rmap->bs,bs2 = baij->bs2;
  PetscInt          *ai = baij->i,*aj = baij->j,*cols;
  PetscInt          row,urow,cnt;
  const MatScalar   *vals = baij->a,*blk;
  const PetscScalar *b,*src;
  PetscScalar       *x,*dst;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward sweep: block row 0 is a plain copy of the right-hand side */
  x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
  x[4] = b[4]; x[5] = b[5];
  for (row=1; row<nblk; row++) {
    blk  = vals + bs2*ai[row];
    cols = aj + ai[row];
    cnt  = ai[row+1] - ai[row];
    src  = b + bs*row;
    s1 = src[0]; s2 = src[1]; s3 = src[2]; s4 = src[3];
    s5 = src[4]; s6 = src[5];
    while (cnt-- > 0) {
      /* already-computed solution block selected by the column index */
      src = x + bs*(*cols++);
      x1 = src[0]; x2 = src[1]; x3 = src[2]; x4 = src[3];
      x5 = src[4]; x6 = src[5];
      s1 -= blk[0]*x1 + blk[6]*x2  + blk[12]*x3 + blk[18]*x4 + blk[24]*x5 + blk[30]*x6;
      s2 -= blk[1]*x1 + blk[7]*x2  + blk[13]*x3 + blk[19]*x4 + blk[25]*x5 + blk[31]*x6;
      s3 -= blk[2]*x1 + blk[8]*x2  + blk[14]*x3 + blk[20]*x4 + blk[26]*x5 + blk[32]*x6;
      s4 -= blk[3]*x1 + blk[9]*x2  + blk[15]*x3 + blk[21]*x4 + blk[27]*x5 + blk[33]*x6;
      s5 -= blk[4]*x1 + blk[10]*x2 + blk[16]*x3 + blk[22]*x4 + blk[28]*x5 + blk[34]*x6;
      s6 -= blk[5]*x1 + blk[11]*x2 + blk[17]*x3 + blk[23]*x4 + blk[29]*x5 + blk[35]*x6;
      blk += bs2;
    }
    dst = x + bs*row;
    dst[0] = s1; dst[1] = s2; dst[2] = s3;
    dst[3] = s4; dst[4] = s5; dst[5] = s6;
  }

  /* backward sweep */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;               /* position of this row's upper part in ai[] */
    blk  = vals + bs2*ai[urow];
    cols = aj + ai[urow];
    cnt  = ai[urow+1] - ai[urow] - 1;  /* the last block of the range is handled below */
    dst  = x + bs*row;
    s1 = dst[0]; s2 = dst[1]; s3 = dst[2]; s4 = dst[3];
    s5 = dst[4]; s6 = dst[5];
    while (cnt-- > 0) {
      src = x + bs*(*cols++);
      x1 = src[0]; x2 = src[1]; x3 = src[2]; x4 = src[3];
      x5 = src[4]; x6 = src[5];
      s1 -= blk[0]*x1 + blk[6]*x2  + blk[12]*x3 + blk[18]*x4 + blk[24]*x5 + blk[30]*x6;
      s2 -= blk[1]*x1 + blk[7]*x2  + blk[13]*x3 + blk[19]*x4 + blk[25]*x5 + blk[31]*x6;
      s3 -= blk[2]*x1 + blk[8]*x2  + blk[14]*x3 + blk[20]*x4 + blk[26]*x5 + blk[32]*x6;
      s4 -= blk[3]*x1 + blk[9]*x2  + blk[15]*x3 + blk[21]*x4 + blk[27]*x5 + blk[33]*x6;
      s5 -= blk[4]*x1 + blk[10]*x2 + blk[16]*x3 + blk[22]*x4 + blk[28]*x5 + blk[34]*x6;
      s6 -= blk[5]*x1 + blk[11]*x2 + blk[17]*x3 + blk[23]*x4 + blk[29]*x5 + blk[35]*x6;
      blk += bs2;
    }
    /* blk is left on the block that follows the ones subtracted: x = inv_diagonal*x */
    dst[0] = blk[0]*s1 + blk[6]*s2  + blk[12]*s3 + blk[18]*s4 + blk[24]*s5 + blk[30]*s6;
    dst[1] = blk[1]*s1 + blk[7]*s2  + blk[13]*s3 + blk[19]*s4 + blk[25]*s5 + blk[31]*s6;
    dst[2] = blk[2]*s1 + blk[8]*s2  + blk[14]*s3 + blk[20]*s4 + blk[26]*s5 + blk[32]*s6;
    dst[3] = blk[3]*s1 + blk[9]*s2  + blk[15]*s3 + blk[21]*s4 + blk[27]*s5 + blk[33]*s6;
    dst[4] = blk[4]*s1 + blk[10]*s2 + blk[16]*s3 + blk[22]*s4 + blk[28]*s5 + blk[34]*s6;
    dst[5] = blk[5]*s1 + blk[11]*s2 + blk[17]*s3 + blk[23]*s4 + blk[29]*s5 + blk[35]*s6;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(baij->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
2280 
2281 #undef __FUNCT__
2282 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2283 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2284 {
2285     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2286     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2287     PetscErrorCode    ierr;
2288     PetscInt          idx,jdx,idt;
2289     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2290     const MatScalar   *aa=a->a,*v;
2291     PetscScalar       *x;
2292     const PetscScalar *b;
2293     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2294 
2295     PetscFunctionBegin;
2296     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2297     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2298     /* forward solve the lower triangular */
2299     idx    = 0;
2300     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2301     x[4] = b[4+idx];x[5] = b[5+idx];
2302     for (i=1; i<n; i++) {
2303        v    = aa + bs2*ai[i];
2304        vi   = aj + ai[i];
2305        nz   = ai[i+1] - ai[i];
2306       idx   = bs*i;
2307        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2308        s5   = b[4+idx];s6 = b[5+idx];
2309        for(k=0;k<nz;k++){
2310           jdx   = bs*vi[k];
2311           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2312 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2313           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2314           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2315           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2316 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2317           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2318 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2319           v   +=  bs2;
2320         }
2321 
2322        x[idx]   = s1;
2323        x[1+idx] = s2;
2324        x[2+idx] = s3;
2325        x[3+idx] = s4;
2326        x[4+idx] = s5;
2327        x[5+idx] = s6;
2328     }
2329 
2330    /* backward solve the upper triangular */
2331   for (i=n-1; i>=0; i--){
2332     v   = aa + bs2*(adiag[i+1]+1);
2333      vi  = aj + adiag[i+1]+1;
2334      nz  = adiag[i] - adiag[i+1]-1;
2335      idt = bs*i;
2336      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2337      s5 = x[4+idt];s6 = x[5+idt];
2338      for(k=0;k<nz;k++){
2339       idx   = bs*vi[k];
2340        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2341        x5    = x[4+idx];x6 = x[5+idx];
2342        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2343        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2344        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2345        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2346        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2347        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2348         v   +=  bs2;
2349     }
2350     /* x = inv_diagonal*x */
2351    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2352    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2353    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2354    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2355    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2356    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2357   }
2358 
2359   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2361   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2362   PetscFunctionReturn(0);
2363 }
2364 
2365 #undef __FUNCT__
2366 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2367 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
2368 {
2369   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2370   IS                iscol=a->col,isrow=a->row;
2371   PetscErrorCode    ierr;
2372   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
2373   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2374   const MatScalar   *aa=a->a,*v;
2375   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2376   const PetscScalar *b;
2377 
2378   PetscFunctionBegin;
2379   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2380   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2381   t  = a->solve_work;
2382 
2383   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2384   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2385 
2386   /* forward solve the lower triangular */
2387   idx    = 5*(*r++);
2388   t[0] = b[idx];   t[1] = b[1+idx];
2389   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2390   for (i=1; i<n; i++) {
2391     v     = aa + 25*ai[i];
2392     vi    = aj + ai[i];
2393     nz    = diag[i] - ai[i];
2394     idx   = 5*(*r++);
2395     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2396     s5  = b[4+idx];
2397     while (nz--) {
2398       idx   = 5*(*vi++);
2399       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2400       x4    = t[3+idx];x5 = t[4+idx];
2401       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2402       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2403       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2404       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2405       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2406       v += 25;
2407     }
2408     idx = 5*i;
2409     t[idx]   = s1;t[1+idx] = s2;
2410     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2411   }
2412   /* backward solve the upper triangular */
2413   for (i=n-1; i>=0; i--){
2414     v    = aa + 25*diag[i] + 25;
2415     vi   = aj + diag[i] + 1;
2416     nz   = ai[i+1] - diag[i] - 1;
2417     idt  = 5*i;
2418     s1 = t[idt];  s2 = t[1+idt];
2419     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2420     while (nz--) {
2421       idx   = 5*(*vi++);
2422       x1    = t[idx];   x2 = t[1+idx];
2423       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2424       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2425       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2426       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2427       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2428       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2429       v += 25;
2430     }
2431     idc = 5*(*c--);
2432     v   = aa + 25*diag[i];
2433     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2434                                  v[15]*s4+v[20]*s5;
2435     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2436                                  v[16]*s4+v[21]*s5;
2437     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2438                                  v[17]*s4+v[22]*s5;
2439     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2440                                  v[18]*s4+v[23]*s5;
2441     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2442                                  v[19]*s4+v[24]*s5;
2443   }
2444 
2445   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2446   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2447   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2448   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2449   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2450   PetscFunctionReturn(0);
2451 }
2452 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
/*
   MatSolve_SeqBAIJ_5_newdatastruct - Older variant, compiled only when
   OLD_ROUTINE_TO_BE_REPLACED is defined.

   Permuted forward/backward block substitution for 5x5 blocks.  The forward
   sweep uses all blocks ai[i]..ai[i+1]-1 of block row i.  The backward sweep
   for block row i reads its blocks through ai[2*n-i]: every block of that range
   except the last is an off-diagonal block, and the last one is multiplied
   into the result.  Blocks are read as blk[row + 5*col].
*/
PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colis = fact->col,rowis = fact->row;
  PetscErrorCode    ierr;
  const PetscInt    *rperm,*cperm;
  PetscInt          i,cnt,urow,mbs = fact->mbs,*cols,*rowptr = fact->i,*colidx = fact->j;
  PetscInt          src,off,dst,out;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,*work;
  PetscScalar       r1,r2,r3,r4,r5,y1,y2,y3,y4,y5;
  const PetscScalar *rhs;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* forward sweep; block row 0 only gathers the permuted right-hand side */
  src     = 5*rperm[0];
  work[0] = rhs[src];
  work[1] = rhs[src+1];
  work[2] = rhs[src+2];
  work[3] = rhs[src+3];
  work[4] = rhs[src+4];
  for (i=1; i<mbs; i++) {
    blk  = vals + 25*rowptr[i];
    cols = colidx + rowptr[i];
    src  = 5*rperm[i];
    r1   = rhs[src];
    r2   = rhs[src+1];
    r3   = rhs[src+2];
    r4   = rhs[src+3];
    r5   = rhs[src+4];
    for (cnt = rowptr[i+1] - rowptr[i]; cnt > 0; cnt--) {
      off = 5*(*cols++);
      y1  = work[off];
      y2  = work[off+1];
      y3  = work[off+2];
      y4  = work[off+3];
      y5  = work[off+4];
      r1 -= blk[0]*y1 + blk[5]*y2 + blk[10]*y3 + blk[15]*y4 + blk[20]*y5;
      r2 -= blk[1]*y1 + blk[6]*y2 + blk[11]*y3 + blk[16]*y4 + blk[21]*y5;
      r3 -= blk[2]*y1 + blk[7]*y2 + blk[12]*y3 + blk[17]*y4 + blk[22]*y5;
      r4 -= blk[3]*y1 + blk[8]*y2 + blk[13]*y3 + blk[18]*y4 + blk[23]*y5;
      r5 -= blk[4]*y1 + blk[9]*y2 + blk[14]*y3 + blk[19]*y4 + blk[24]*y5;
      blk += 25;
    }
    dst         = 5*i;
    work[dst]   = r1;
    work[dst+1] = r2;
    work[dst+2] = r3;
    work[dst+3] = r4;
    work[dst+4] = r5;
  }

  /* backward sweep */
  for (i=mbs-1; i>=0; i--) {
    urow = 2*mbs - i;
    blk  = vals + 25*rowptr[urow];
    cols = colidx + rowptr[urow];
    dst  = 5*i;
    r1   = work[dst];
    r2   = work[dst+1];
    r3   = work[dst+2];
    r4   = work[dst+3];
    r5   = work[dst+4];
    for (cnt = rowptr[urow+1] - rowptr[urow] - 1; cnt > 0; cnt--) {
      off = 5*(*cols++);
      y1  = work[off];
      y2  = work[off+1];
      y3  = work[off+2];
      y4  = work[off+3];
      y5  = work[off+4];
      r1 -= blk[0]*y1 + blk[5]*y2 + blk[10]*y3 + blk[15]*y4 + blk[20]*y5;
      r2 -= blk[1]*y1 + blk[6]*y2 + blk[11]*y3 + blk[16]*y4 + blk[21]*y5;
      r3 -= blk[2]*y1 + blk[7]*y2 + blk[12]*y3 + blk[17]*y4 + blk[22]*y5;
      r4 -= blk[3]*y1 + blk[8]*y2 + blk[13]*y3 + blk[18]*y4 + blk[23]*y5;
      r5 -= blk[4]*y1 + blk[9]*y2 + blk[14]*y3 + blk[19]*y4 + blk[24]*y5;
      blk += 25;
    }
    /* blk now addresses the last block of the range; multiply it in */
    out        = 5*cperm[i];
    sol[out]   = work[dst]   = blk[0]*r1+blk[5]*r2+blk[10]*r3+blk[15]*r4+blk[20]*r5;
    sol[out+1] = work[dst+1] = blk[1]*r1+blk[6]*r2+blk[11]*r3+blk[16]*r4+blk[21]*r5;
    sol[out+2] = work[dst+2] = blk[2]*r1+blk[7]*r2+blk[12]*r3+blk[17]*r4+blk[22]*r5;
    sol[out+3] = work[dst+3] = blk[3]*r1+blk[8]*r2+blk[13]*r3+blk[18]*r4+blk[23]*r5;
    sol[out+4] = work[dst+4] = blk[4]*r1+blk[9]*r2+blk[14]*r3+blk[19]*r4+blk[24]*r5;
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(fact->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
2542 
2543 #undef __FUNCT__
2544 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2545 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
2546 {
2547   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2548   IS                iscol=a->col,isrow=a->row;
2549   PetscErrorCode    ierr;
2550   const PetscInt    *r,*c,*rout,*cout;
2551   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2552   const MatScalar   *aa=a->a,*v;
2553   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2554   const PetscScalar *b;
2555 
2556   PetscFunctionBegin;
2557   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2558   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2559   t  = a->solve_work;
2560 
2561   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2562   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2563 
2564   /* forward solve the lower triangular */
2565   idx    = 5*r[0];
2566   t[0] = b[idx];   t[1] = b[1+idx];
2567   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2568   for (i=1; i<n; i++) {
2569     v     = aa + 25*ai[i];
2570     vi    = aj + ai[i];
2571     nz    = ai[i+1] - ai[i];
2572     idx   = 5*r[i];
2573     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2574     s5  = b[4+idx];
2575     for(m=0;m<nz;m++){
2576       idx   = 5*vi[m];
2577       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2578       x4    = t[3+idx];x5 = t[4+idx];
2579       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2580       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2581       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2582       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2583       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2584       v += 25;
2585     }
2586     idx = 5*i;
2587     t[idx]   = s1;t[1+idx] = s2;
2588     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2589   }
2590   /* backward solve the upper triangular */
2591   for (i=n-1; i>=0; i--){
2592     v    = aa + 25*(adiag[i+1]+1);
2593     vi   = aj + adiag[i+1]+1;
2594     nz   = adiag[i] - adiag[i+1] - 1;
2595     idt  = 5*i;
2596     s1 = t[idt];  s2 = t[1+idt];
2597     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2598     for(m=0;m<nz;m++){
2599       idx   = 5*vi[m];
2600       x1    = t[idx];   x2 = t[1+idx];
2601       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2602       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2603       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2604       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2605       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2606       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2607       v += 25;
2608     }
2609     idc = 5*c[i];
2610     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2611                                  v[15]*s4+v[20]*s5;
2612     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2613                                  v[16]*s4+v[21]*s5;
2614     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2615                                  v[17]*s4+v[22]*s5;
2616     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2617                                  v[18]*s4+v[23]*s5;
2618     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2619                                  v[19]*s4+v[24]*s5;
2620   }
2621 
2622   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2623   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2624   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2625   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2626   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2627   PetscFunctionReturn(0);
2628 }
2629 
2630 #undef __FUNCT__
2631 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2632 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
2633 {
2634   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2635   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2636   PetscErrorCode    ierr;
2637   PetscInt          *diag = a->diag,jdx;
2638   const MatScalar   *aa=a->a,*v;
2639   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2640   const PetscScalar *b;
2641 
2642   PetscFunctionBegin;
2643   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2644   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2645   /* forward solve the lower triangular */
2646   idx    = 0;
2647   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2648   for (i=1; i<n; i++) {
2649     v     =  aa + 25*ai[i];
2650     vi    =  aj + ai[i];
2651     nz    =  diag[i] - ai[i];
2652     idx   =  5*i;
2653     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2654     while (nz--) {
2655       jdx   = 5*(*vi++);
2656       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2657       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2658       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2659       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2660       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2661       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2662       v    += 25;
2663     }
2664     x[idx]   = s1;
2665     x[1+idx] = s2;
2666     x[2+idx] = s3;
2667     x[3+idx] = s4;
2668     x[4+idx] = s5;
2669   }
2670   /* backward solve the upper triangular */
2671   for (i=n-1; i>=0; i--){
2672     v    = aa + 25*diag[i] + 25;
2673     vi   = aj + diag[i] + 1;
2674     nz   = ai[i+1] - diag[i] - 1;
2675     idt  = 5*i;
2676     s1 = x[idt];  s2 = x[1+idt];
2677     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2678     while (nz--) {
2679       idx   = 5*(*vi++);
2680       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2681       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2682       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2683       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2684       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2685       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2686       v    += 25;
2687     }
2688     v        = aa + 25*diag[i];
2689     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2690     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2691     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2692     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2693     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2694   }
2695 
2696   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2698   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2699   PetscFunctionReturn(0);
2700 }
2701 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct - Older variant, compiled
   only when OLD_ROUTINE_TO_BE_REPLACED is defined.

   Forward/backward block substitution for 5x5 blocks without index sets.  The
   forward sweep uses all blocks ai[i]..ai[i+1]-1 of block row i.  The backward
   sweep for block row i reads its blocks through ai[2*n-i]: every block of that
   range except the last is an off-diagonal block, and the last one is
   multiplied into the result.  Blocks are read as blk[row + 5*col].
*/
PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          i,cnt,urow,mbs = fact->mbs,*cols,*rowptr = fact->i,*colidx = fact->j;
  PetscInt          row,col;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol;
  PetscScalar       r1,r2,r3,r4,r5,y1,y2,y3,y4,y5;
  const PetscScalar *rhs;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);

  /* forward sweep; block row 0 is a plain copy of the right-hand side */
  sol[0] = rhs[0];
  sol[1] = rhs[1];
  sol[2] = rhs[2];
  sol[3] = rhs[3];
  sol[4] = rhs[4];
  for (i=1; i<mbs; i++) {
    blk  = vals + 25*rowptr[i];
    cols = colidx + rowptr[i];
    row  = 5*i;
    r1   = rhs[row];
    r2   = rhs[row+1];
    r3   = rhs[row+2];
    r4   = rhs[row+3];
    r5   = rhs[row+4];
    for (cnt = rowptr[i+1] - rowptr[i]; cnt > 0; cnt--) {
      col = 5*(*cols++);
      y1  = sol[col];
      y2  = sol[col+1];
      y3  = sol[col+2];
      y4  = sol[col+3];
      y5  = sol[col+4];
      r1 -= blk[0]*y1 + blk[5]*y2 + blk[10]*y3 + blk[15]*y4 + blk[20]*y5;
      r2 -= blk[1]*y1 + blk[6]*y2 + blk[11]*y3 + blk[16]*y4 + blk[21]*y5;
      r3 -= blk[2]*y1 + blk[7]*y2 + blk[12]*y3 + blk[17]*y4 + blk[22]*y5;
      r4 -= blk[3]*y1 + blk[8]*y2 + blk[13]*y3 + blk[18]*y4 + blk[23]*y5;
      r5 -= blk[4]*y1 + blk[9]*y2 + blk[14]*y3 + blk[19]*y4 + blk[24]*y5;
      blk += 25;
    }
    sol[row]   = r1;
    sol[row+1] = r2;
    sol[row+2] = r3;
    sol[row+3] = r4;
    sol[row+4] = r5;
  }

  /* backward sweep */
  for (i=mbs-1; i>=0; i--) {
    urow = 2*mbs - i;
    blk  = vals + 25*rowptr[urow];
    cols = colidx + rowptr[urow];
    row  = 5*i;
    r1   = sol[row];
    r2   = sol[row+1];
    r3   = sol[row+2];
    r4   = sol[row+3];
    r5   = sol[row+4];
    for (cnt = rowptr[urow+1] - rowptr[urow] - 1; cnt > 0; cnt--) {
      col = 5*(*cols++);
      y1  = sol[col];
      y2  = sol[col+1];
      y3  = sol[col+2];
      y4  = sol[col+3];
      y5  = sol[col+4];
      r1 -= blk[0]*y1 + blk[5]*y2 + blk[10]*y3 + blk[15]*y4 + blk[20]*y5;
      r2 -= blk[1]*y1 + blk[6]*y2 + blk[11]*y3 + blk[16]*y4 + blk[21]*y5;
      r3 -= blk[2]*y1 + blk[7]*y2 + blk[12]*y3 + blk[17]*y4 + blk[22]*y5;
      r4 -= blk[3]*y1 + blk[8]*y2 + blk[13]*y3 + blk[18]*y4 + blk[23]*y5;
      r5 -= blk[4]*y1 + blk[9]*y2 + blk[14]*y3 + blk[19]*y4 + blk[24]*y5;
      blk += 25;
    }
    /* x = inv_diagonal*x; blk now addresses the last block of the range */
    sol[row]   = blk[0]*r1 + blk[5]*r2 + blk[10]*r3 + blk[15]*r4 + blk[20]*r5;
    sol[row+1] = blk[1]*r1 + blk[6]*r2 + blk[11]*r3 + blk[16]*r4 + blk[21]*r5;
    sol[row+2] = blk[2]*r1 + blk[7]*r2 + blk[12]*r3 + blk[17]*r4 + blk[22]*r5;
    sol[row+3] = blk[3]*r1 + blk[8]*r2 + blk[13]*r3 + blk[18]*r4 + blk[23]*r5;
    sol[row+4] = blk[4]*r1 + blk[9]*r2 + blk[14]*r3 + blk[19]*r4 + blk[24]*r5;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(fact->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
2776 
2777 #undef __FUNCT__
2778 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2779 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2780 {
2781   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2782   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
2783   PetscErrorCode    ierr;
2784   PetscInt          jdx;
2785   const MatScalar   *aa=a->a,*v;
2786   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2787   const PetscScalar *b;
2788 
2789   PetscFunctionBegin;
2790   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2791   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2792   /* forward solve the lower triangular */
2793   idx    = 0;
2794   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2795   for (i=1; i<n; i++) {
2796     v   = aa + 25*ai[i];
2797     vi  = aj + ai[i];
2798     nz  = ai[i+1] - ai[i];
2799     idx = 5*i;
2800     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2801     for(k=0;k<nz;k++) {
2802       jdx   = 5*vi[k];
2803       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2804       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2805       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2806       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2807       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2808       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2809       v    += 25;
2810     }
2811     x[idx]   = s1;
2812     x[1+idx] = s2;
2813     x[2+idx] = s3;
2814     x[3+idx] = s4;
2815     x[4+idx] = s5;
2816   }
2817 
2818   /* backward solve the upper triangular */
2819   for (i=n-1; i>=0; i--){
2820     v   = aa + 25*(adiag[i+1]+1);
2821     vi  = aj + adiag[i+1]+1;
2822     nz  = adiag[i] - adiag[i+1]-1;
2823     idt = 5*i;
2824     s1 = x[idt];  s2 = x[1+idt];
2825     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2826     for(k=0;k<nz;k++){
2827       idx   = 5*vi[k];
2828       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2829       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2830       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2831       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2832       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2833       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2834       v    += 25;
2835     }
2836     /* x = inv_diagonal*x */
2837     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2838     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2839     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2840     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2841     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2842   }
2843 
2844   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2845   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2846   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2847   PetscFunctionReturn(0);
2848 }
2849 
2850 #undef __FUNCT__
2851 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2852 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2853 {
2854   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2855   IS                iscol=a->col,isrow=a->row;
2856   PetscErrorCode    ierr;
2857   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2858   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2859   const MatScalar   *aa=a->a,*v;
2860   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2861   const PetscScalar *b;
2862 
2863   PetscFunctionBegin;
2864   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2865   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2866   t  = a->solve_work;
2867 
2868   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2869   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2870 
2871   /* forward solve the lower triangular */
2872   idx    = 4*(*r++);
2873   t[0] = b[idx];   t[1] = b[1+idx];
2874   t[2] = b[2+idx]; t[3] = b[3+idx];
2875   for (i=1; i<n; i++) {
2876     v     = aa + 16*ai[i];
2877     vi    = aj + ai[i];
2878     nz    = diag[i] - ai[i];
2879     idx   = 4*(*r++);
2880     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2881     while (nz--) {
2882       idx   = 4*(*vi++);
2883       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2884       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2885       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2886       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2887       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2888       v    += 16;
2889     }
2890     idx        = 4*i;
2891     t[idx]   = s1;t[1+idx] = s2;
2892     t[2+idx] = s3;t[3+idx] = s4;
2893   }
2894   /* backward solve the upper triangular */
2895   for (i=n-1; i>=0; i--){
2896     v    = aa + 16*diag[i] + 16;
2897     vi   = aj + diag[i] + 1;
2898     nz   = ai[i+1] - diag[i] - 1;
2899     idt  = 4*i;
2900     s1 = t[idt];  s2 = t[1+idt];
2901     s3 = t[2+idt];s4 = t[3+idt];
2902     while (nz--) {
2903       idx   = 4*(*vi++);
2904       x1    = t[idx];   x2 = t[1+idx];
2905       x3    = t[2+idx]; x4 = t[3+idx];
2906       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2907       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2908       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2909       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2910       v += 16;
2911     }
2912     idc      = 4*(*c--);
2913     v        = aa + 16*diag[i];
2914     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2915     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2916     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2917     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2918   }
2919 
2920   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2921   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2922   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2923   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2924   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2925   PetscFunctionReturn(0);
2926 }
2927 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
/*
   MatSolve_SeqBAIJ_4_newdatastruct - Older variant, compiled only when
   OLD_ROUTINE_TO_BE_REPLACED is defined.

   Permuted forward/backward block substitution for 4x4 blocks.  The forward
   sweep uses all blocks ai[i]..ai[i+1]-1 of block row i.  The backward sweep
   for block row i reads its blocks through ai[2*n-i]: every block of that range
   except the last is an off-diagonal block, and the last one is multiplied
   into the result.  Blocks are read as blk[row + 4*col].
*/
PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colis = fact->col,rowis = fact->row;
  PetscErrorCode    ierr;
  PetscInt          i,cnt,urow,mbs = fact->mbs,*cols,*rowptr = fact->i,*colidx = fact->j;
  PetscInt          src,off,dst,out;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,*work;
  PetscScalar       r1,r2,r3,r4,y1,y2,y3,y4;
  const PetscScalar *rhs;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* forward sweep; block row 0 only gathers the permuted right-hand side */
  src     = 4*rperm[0];
  work[0] = rhs[src];
  work[1] = rhs[src+1];
  work[2] = rhs[src+2];
  work[3] = rhs[src+3];
  for (i=1; i<mbs; i++) {
    blk  = vals + 16*rowptr[i];
    cols = colidx + rowptr[i];
    src  = 4*rperm[i];
    r1   = rhs[src];
    r2   = rhs[src+1];
    r3   = rhs[src+2];
    r4   = rhs[src+3];
    for (cnt = rowptr[i+1] - rowptr[i]; cnt > 0; cnt--) {
      off = 4*(*cols++);
      y1  = work[off];
      y2  = work[off+1];
      y3  = work[off+2];
      y4  = work[off+3];
      r1 -= blk[0]*y1 + blk[4]*y2 + blk[8]*y3  + blk[12]*y4;
      r2 -= blk[1]*y1 + blk[5]*y2 + blk[9]*y3  + blk[13]*y4;
      r3 -= blk[2]*y1 + blk[6]*y2 + blk[10]*y3 + blk[14]*y4;
      r4 -= blk[3]*y1 + blk[7]*y2 + blk[11]*y3 + blk[15]*y4;
      blk += 16;
    }
    dst         = 4*i;
    work[dst]   = r1;
    work[dst+1] = r2;
    work[dst+2] = r3;
    work[dst+3] = r4;
  }

  /* backward sweep */
  for (i=mbs-1; i>=0; i--) {
    urow = 2*mbs - i;
    blk  = vals + 16*rowptr[urow];
    cols = colidx + rowptr[urow];
    dst  = 4*i;
    r1   = work[dst];
    r2   = work[dst+1];
    r3   = work[dst+2];
    r4   = work[dst+3];
    for (cnt = rowptr[urow+1] - rowptr[urow] - 1; cnt > 0; cnt--) {
      off = 4*(*cols++);
      y1  = work[off];
      y2  = work[off+1];
      y3  = work[off+2];
      y4  = work[off+3];
      r1 -= blk[0]*y1 + blk[4]*y2 + blk[8]*y3  + blk[12]*y4;
      r2 -= blk[1]*y1 + blk[5]*y2 + blk[9]*y3  + blk[13]*y4;
      r3 -= blk[2]*y1 + blk[6]*y2 + blk[10]*y3 + blk[14]*y4;
      r4 -= blk[3]*y1 + blk[7]*y2 + blk[11]*y3 + blk[15]*y4;
      blk += 16;
    }
    /* blk now addresses the last block of the range; multiply it in */
    out        = 4*cperm[i];
    sol[out]   = work[dst]   = blk[0]*r1+blk[4]*r2+blk[8]*r3+blk[12]*r4;
    sol[out+1] = work[dst+1] = blk[1]*r1+blk[5]*r2+blk[9]*r3+blk[13]*r4;
    sol[out+2] = work[dst+2] = blk[2]*r1+blk[6]*r2+blk[10]*r3+blk[14]*r4;
    sol[out+3] = work[dst+3] = blk[3]*r1+blk[7]*r2+blk[11]*r3+blk[15]*r4;
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(fact->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
3007 
3008 #undef __FUNCT__
3009 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
3010 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
3011 {
3012   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3013   IS                iscol=a->col,isrow=a->row;
3014   PetscErrorCode    ierr;
3015   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3016   const PetscInt    *r,*c,*rout,*cout;
3017   const MatScalar   *aa=a->a,*v;
3018   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3019   const PetscScalar *b;
3020 
3021   PetscFunctionBegin;
3022   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3023   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3024   t  = a->solve_work;
3025 
3026   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3027   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3028 
3029   /* forward solve the lower triangular */
3030   idx    = 4*r[0];
3031   t[0] = b[idx];   t[1] = b[1+idx];
3032   t[2] = b[2+idx]; t[3] = b[3+idx];
3033   for (i=1; i<n; i++) {
3034     v     = aa + 16*ai[i];
3035     vi    = aj + ai[i];
3036     nz    = ai[i+1] - ai[i];
3037     idx   = 4*r[i];
3038     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3039     for(m=0;m<nz;m++){
3040       idx   = 4*vi[m];
3041       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3042       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3043       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3044       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3045       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3046       v    += 16;
3047     }
3048     idx        = 4*i;
3049     t[idx]   = s1;t[1+idx] = s2;
3050     t[2+idx] = s3;t[3+idx] = s4;
3051   }
3052   /* backward solve the upper triangular */
3053   for (i=n-1; i>=0; i--){
3054     v    = aa + 16*(adiag[i+1]+1);
3055     vi   = aj + adiag[i+1]+1;
3056     nz   = adiag[i] - adiag[i+1] - 1;
3057     idt  = 4*i;
3058     s1 = t[idt];  s2 = t[1+idt];
3059     s3 = t[2+idt];s4 = t[3+idt];
3060     for(m=0;m<nz;m++){
3061       idx   = 4*vi[m];
3062       x1    = t[idx];   x2 = t[1+idx];
3063       x3    = t[2+idx]; x4 = t[3+idx];
3064       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3065       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3066       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3067       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3068       v += 16;
3069     }
3070     idc      = 4*c[i];
3071     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3072     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3073     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3074     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3075   }
3076 
3077   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3078   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3079   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3080   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3081   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3082   PetscFunctionReturn(0);
3083 }
3084 
3085 #undef __FUNCT__
3086 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3087 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3088 {
3089   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3090   IS                iscol=a->col,isrow=a->row;
3091   PetscErrorCode    ierr;
3092   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3093   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3094   const MatScalar   *aa=a->a,*v;
3095   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3096   PetscScalar       *x;
3097   const PetscScalar *b;
3098 
3099   PetscFunctionBegin;
3100   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3101   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3102   t  = (MatScalar *)a->solve_work;
3103 
3104   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3105   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3106 
3107   /* forward solve the lower triangular */
3108   idx    = 4*(*r++);
3109   t[0] = (MatScalar)b[idx];
3110   t[1] = (MatScalar)b[1+idx];
3111   t[2] = (MatScalar)b[2+idx];
3112   t[3] = (MatScalar)b[3+idx];
3113   for (i=1; i<n; i++) {
3114     v     = aa + 16*ai[i];
3115     vi    = aj + ai[i];
3116     nz    = diag[i] - ai[i];
3117     idx   = 4*(*r++);
3118     s1 = (MatScalar)b[idx];
3119     s2 = (MatScalar)b[1+idx];
3120     s3 = (MatScalar)b[2+idx];
3121     s4 = (MatScalar)b[3+idx];
3122     while (nz--) {
3123       idx   = 4*(*vi++);
3124       x1  = t[idx];
3125       x2  = t[1+idx];
3126       x3  = t[2+idx];
3127       x4  = t[3+idx];
3128       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3129       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3130       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3131       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3132       v    += 16;
3133     }
3134     idx        = 4*i;
3135     t[idx]   = s1;
3136     t[1+idx] = s2;
3137     t[2+idx] = s3;
3138     t[3+idx] = s4;
3139   }
3140   /* backward solve the upper triangular */
3141   for (i=n-1; i>=0; i--){
3142     v    = aa + 16*diag[i] + 16;
3143     vi   = aj + diag[i] + 1;
3144     nz   = ai[i+1] - diag[i] - 1;
3145     idt  = 4*i;
3146     s1 = t[idt];
3147     s2 = t[1+idt];
3148     s3 = t[2+idt];
3149     s4 = t[3+idt];
3150     while (nz--) {
3151       idx   = 4*(*vi++);
3152       x1  = t[idx];
3153       x2  = t[1+idx];
3154       x3  = t[2+idx];
3155       x4  = t[3+idx];
3156       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3157       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3158       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3159       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3160       v += 16;
3161     }
3162     idc      = 4*(*c--);
3163     v        = aa + 16*diag[i];
3164     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3165     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3166     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3167     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3168     x[idc]   = (PetscScalar)t[idt];
3169     x[1+idc] = (PetscScalar)t[1+idt];
3170     x[2+idc] = (PetscScalar)t[2+idt];
3171     x[3+idc] = (PetscScalar)t[3+idt];
3172  }
3173 
3174   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3175   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3176   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3177   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3178   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3179   PetscFunctionReturn(0);
3180 }
3181 
3182 #if defined (PETSC_HAVE_SSE)
3183 
3184 #include PETSC_HAVE_SSE
3185 
3186 #undef __FUNCT__
3187 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3188 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3189 {
3190   /*
3191      Note: This code uses demotion of double
3192      to float when performing the mixed-mode computation.
3193      This may not be numerically reasonable for all applications.
3194   */
3195   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3196   IS             iscol=a->col,isrow=a->row;
3197   PetscErrorCode ierr;
3198   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3199   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3200   MatScalar      *aa=a->a,*v;
3201   PetscScalar    *x,*b,*t;
3202 
3203   /* Make space in temp stack for 16 Byte Aligned arrays */
3204   float           ssealignedspace[11],*tmps,*tmpx;
3205   unsigned long   offset;
3206 
3207   PetscFunctionBegin;
3208   SSE_SCOPE_BEGIN;
3209 
3210     offset = (unsigned long)ssealignedspace % 16;
3211     if (offset) offset = (16 - offset)/4;
3212     tmps = &ssealignedspace[offset];
3213     tmpx = &ssealignedspace[offset+4];
3214     PREFETCH_NTA(aa+16*ai[1]);
3215 
3216     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3217     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3218     t  = a->solve_work;
3219 
3220     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3221     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3222 
3223     /* forward solve the lower triangular */
3224     idx  = 4*(*r++);
3225     t[0] = b[idx];   t[1] = b[1+idx];
3226     t[2] = b[2+idx]; t[3] = b[3+idx];
3227     v    =  aa + 16*ai[1];
3228 
3229     for (i=1; i<n;) {
3230       PREFETCH_NTA(&v[8]);
3231       vi   =  aj      + ai[i];
3232       nz   =  diag[i] - ai[i];
3233       idx  =  4*(*r++);
3234 
3235       /* Demote sum from double to float */
3236       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3237       LOAD_PS(tmps,XMM7);
3238 
3239       while (nz--) {
3240         PREFETCH_NTA(&v[16]);
3241         idx = 4*(*vi++);
3242 
3243         /* Demote solution (so far) from double to float */
3244         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3245 
3246         /* 4x4 Matrix-Vector product with negative accumulation: */
3247         SSE_INLINE_BEGIN_2(tmpx,v)
3248           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3249 
3250           /* First Column */
3251           SSE_COPY_PS(XMM0,XMM6)
3252           SSE_SHUFFLE(XMM0,XMM0,0x00)
3253           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3254           SSE_SUB_PS(XMM7,XMM0)
3255 
3256           /* Second Column */
3257           SSE_COPY_PS(XMM1,XMM6)
3258           SSE_SHUFFLE(XMM1,XMM1,0x55)
3259           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3260           SSE_SUB_PS(XMM7,XMM1)
3261 
3262           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3263 
3264           /* Third Column */
3265           SSE_COPY_PS(XMM2,XMM6)
3266           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3267           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3268           SSE_SUB_PS(XMM7,XMM2)
3269 
3270           /* Fourth Column */
3271           SSE_COPY_PS(XMM3,XMM6)
3272           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3273           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3274           SSE_SUB_PS(XMM7,XMM3)
3275         SSE_INLINE_END_2
3276 
3277         v  += 16;
3278       }
3279       idx = 4*i;
3280       v   = aa + 16*ai[++i];
3281       PREFETCH_NTA(v);
3282       STORE_PS(tmps,XMM7);
3283 
3284       /* Promote result from float to double */
3285       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3286     }
3287     /* backward solve the upper triangular */
3288     idt  = 4*(n-1);
3289     ai16 = 16*diag[n-1];
3290     v    = aa + ai16 + 16;
3291     for (i=n-1; i>=0;){
3292       PREFETCH_NTA(&v[8]);
3293       vi = aj + diag[i] + 1;
3294       nz = ai[i+1] - diag[i] - 1;
3295 
3296       /* Demote accumulator from double to float */
3297       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3298       LOAD_PS(tmps,XMM7);
3299 
3300       while (nz--) {
3301         PREFETCH_NTA(&v[16]);
3302         idx = 4*(*vi++);
3303 
3304         /* Demote solution (so far) from double to float */
3305         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3306 
3307         /* 4x4 Matrix-Vector Product with negative accumulation: */
3308         SSE_INLINE_BEGIN_2(tmpx,v)
3309           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3310 
3311           /* First Column */
3312           SSE_COPY_PS(XMM0,XMM6)
3313           SSE_SHUFFLE(XMM0,XMM0,0x00)
3314           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3315           SSE_SUB_PS(XMM7,XMM0)
3316 
3317           /* Second Column */
3318           SSE_COPY_PS(XMM1,XMM6)
3319           SSE_SHUFFLE(XMM1,XMM1,0x55)
3320           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3321           SSE_SUB_PS(XMM7,XMM1)
3322 
3323           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3324 
3325           /* Third Column */
3326           SSE_COPY_PS(XMM2,XMM6)
3327           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3328           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3329           SSE_SUB_PS(XMM7,XMM2)
3330 
3331           /* Fourth Column */
3332           SSE_COPY_PS(XMM3,XMM6)
3333           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3334           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3335           SSE_SUB_PS(XMM7,XMM3)
3336         SSE_INLINE_END_2
3337         v  += 16;
3338       }
3339       v    = aa + ai16;
3340       ai16 = 16*diag[--i];
3341       PREFETCH_NTA(aa+ai16+16);
3342       /*
3343          Scale the result by the diagonal 4x4 block,
3344          which was inverted as part of the factorization
3345       */
3346       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3347         /* First Column */
3348         SSE_COPY_PS(XMM0,XMM7)
3349         SSE_SHUFFLE(XMM0,XMM0,0x00)
3350         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3351 
3352         /* Second Column */
3353         SSE_COPY_PS(XMM1,XMM7)
3354         SSE_SHUFFLE(XMM1,XMM1,0x55)
3355         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3356         SSE_ADD_PS(XMM0,XMM1)
3357 
3358         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3359 
3360         /* Third Column */
3361         SSE_COPY_PS(XMM2,XMM7)
3362         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3363         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3364         SSE_ADD_PS(XMM0,XMM2)
3365 
3366         /* Fourth Column */
3367         SSE_COPY_PS(XMM3,XMM7)
3368         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3369         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3370         SSE_ADD_PS(XMM0,XMM3)
3371 
3372         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3373       SSE_INLINE_END_3
3374 
3375       /* Promote solution from float to double */
3376       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3377 
3378       /* Apply reordering to t and stream into x.    */
3379       /* This way, x doesn't pollute the cache.      */
3380       /* Be careful with size: 2 doubles = 4 floats! */
3381       idc  = 4*(*c--);
3382       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3383         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3384         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3385         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3386         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3387         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3388         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3389       SSE_INLINE_END_2
3390       v    = aa + ai16 + 16;
3391       idt -= 4;
3392     }
3393 
3394     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3395     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3396     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3397     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3398     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3399   SSE_SCOPE_END;
3400   PetscFunctionReturn(0);
3401 }
3402 
3403 #endif
3404 
3405 
3406 /*
3407       Special case where the matrix was ILU(0) factored in the natural
3408    ordering. This eliminates the need for the column and row permutation.
3409 */
3410 #undef __FUNCT__
3411 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3412 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3413 {
3414   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3415   PetscInt          n=a->mbs;
3416   const PetscInt    *ai=a->i,*aj=a->j;
3417   PetscErrorCode    ierr;
3418   const PetscInt    *diag = a->diag;
3419   const MatScalar   *aa=a->a;
3420   PetscScalar       *x;
3421   const PetscScalar *b;
3422 
3423   PetscFunctionBegin;
3424   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3425   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3426 
3427 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3428   {
3429     static PetscScalar w[2000]; /* very BAD need to fix */
3430     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3431   }
3432 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3433   {
3434     static PetscScalar w[2000]; /* very BAD need to fix */
3435     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3436   }
3437 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3438   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3439 #else
3440   {
3441     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3442     const MatScalar *v;
3443     PetscInt        jdx,idt,idx,nz,i,ai16;
3444     const PetscInt  *vi;
3445 
3446   /* forward solve the lower triangular */
3447   idx    = 0;
3448   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3449   for (i=1; i<n; i++) {
3450     v     =  aa      + 16*ai[i];
3451     vi    =  aj      + ai[i];
3452     nz    =  diag[i] - ai[i];
3453     idx   +=  4;
3454     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3455     while (nz--) {
3456       jdx   = 4*(*vi++);
3457       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3458       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3459       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3460       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3461       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3462       v    += 16;
3463     }
3464     x[idx]   = s1;
3465     x[1+idx] = s2;
3466     x[2+idx] = s3;
3467     x[3+idx] = s4;
3468   }
3469   /* backward solve the upper triangular */
3470   idt = 4*(n-1);
3471   for (i=n-1; i>=0; i--){
3472     ai16 = 16*diag[i];
3473     v    = aa + ai16 + 16;
3474     vi   = aj + diag[i] + 1;
3475     nz   = ai[i+1] - diag[i] - 1;
3476     s1 = x[idt];  s2 = x[1+idt];
3477     s3 = x[2+idt];s4 = x[3+idt];
3478     while (nz--) {
3479       idx   = 4*(*vi++);
3480       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3481       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3482       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3483       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3484       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3485       v    += 16;
3486     }
3487     v        = aa + ai16;
3488     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3489     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3490     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3491     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3492     idt -= 4;
3493   }
3494   }
3495 #endif
3496 
3497   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3498   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3499   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3500   PetscFunctionReturn(0);
3501 }
3502 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
/*
   Older variant of MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct, compiled
   only when OLD_ROUTINE_TO_BE_REPLACED is defined.

   Unpermuted triangular solve for 4x4 blocks, done in place in xx.  This
   variant locates block row i of U through a->i[2*n-i] .. a->i[2*n-i+1]-1,
   the last entry of that range being the diagonal block; block row i of L is
   a->i[i] .. a->i[i+1]-1.  Each block has entry(r,c) = v[r + 4*c].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  PetscInt          nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j;
  PetscInt          *cols,row,urow,cnt,col,pos;
  PetscErrorCode    ierr;
  PetscInt          bs = A->rmap->bs,bs2 = fact->bs2;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* Forward solve with L: block row 0 is a plain copy of the right-hand side */
  x[0] = b[0];
  x[1] = b[1];
  x[2] = b[2];
  x[3] = b[3];
  for (row=1; row<nblk; row++) {
    blk  = vals + bs2*rowptr[row];
    cols = colidx + rowptr[row];
    cnt  = rowptr[row+1] - rowptr[row];
    pos  = bs*row;
    s1   = b[pos];
    s2   = b[1+pos];
    s3   = b[2+pos];
    s4   = b[3+pos];
    while (cnt-- > 0) {
      col = bs*(*cols++);
      x1  = x[col];
      x2  = x[1+col];
      x3  = x[2+col];
      x4  = x[3+col];
      s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3 + blk[12]*x4;
      s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3 + blk[13]*x4;
      s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
      s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
      blk += bs2;
    }
    x[pos]   = s1;
    x[1+pos] = s2;
    x[2+pos] = s3;
    x[3+pos] = s4;
  }

  /* Backward solve with U, last block row first */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;                 /* position of this row's U part in a->i */
    blk  = vals + bs2*rowptr[urow];
    cols = colidx + rowptr[urow];
    cnt  = rowptr[urow+1] - rowptr[urow] - 1;
    pos  = bs*row;
    s1   = x[pos];
    s2   = x[1+pos];
    s3   = x[2+pos];
    s4   = x[3+pos];
    while (cnt-- > 0) {
      col = bs*(*cols++);
      x1  = x[col];
      x2  = x[1+col];
      x3  = x[2+col];
      x4  = x[3+col];
      s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3 + blk[12]*x4;
      s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3 + blk[13]*x4;
      s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
      s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
      blk += bs2;
    }
    /* x = inv_diagonal*x; blk now sits on the last block of the range */
    x[pos]   = blk[0]*s1 + blk[4]*s2 + blk[8]*s3 + blk[12]*s4;
    x[1+pos] = blk[1]*s1 + blk[5]*s2 + blk[9]*s3 + blk[13]*s4;
    x[2+pos] = blk[2]*s1 + blk[6]*s2 + blk[10]*s3 + blk[14]*s4;
    x[3+pos] = blk[3]*s1 + blk[7]*s2 + blk[11]*s3 + blk[15]*s4;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(fact->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
3579 
3580 #undef __FUNCT__
3581 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3582 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3583 {
3584     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3585     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3586     PetscErrorCode    ierr;
3587     PetscInt          idx,jdx,idt;
3588     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3589     const MatScalar   *aa=a->a,*v;
3590     PetscScalar       *x;
3591     const PetscScalar *b;
3592     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3593 
3594     PetscFunctionBegin;
3595     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3596     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3597     /* forward solve the lower triangular */
3598     idx    = 0;
3599     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3600     for (i=1; i<n; i++) {
3601        v    = aa + bs2*ai[i];
3602        vi   = aj + ai[i];
3603        nz   = ai[i+1] - ai[i];
3604       idx   = bs*i;
3605        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3606       for(k=0;k<nz;k++) {
3607           jdx   = bs*vi[k];
3608           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3609           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3610           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3611           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3612 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3613 
3614           v   +=  bs2;
3615         }
3616 
3617        x[idx]   = s1;
3618        x[1+idx] = s2;
3619        x[2+idx] = s3;
3620        x[3+idx] = s4;
3621     }
3622 
3623    /* backward solve the upper triangular */
3624   for (i=n-1; i>=0; i--){
3625     v   = aa + bs2*(adiag[i+1]+1);
3626      vi  = aj + adiag[i+1]+1;
3627      nz  = adiag[i] - adiag[i+1]-1;
3628      idt = bs*i;
3629      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3630 
3631     for(k=0;k<nz;k++){
3632       idx   = bs*vi[k];
3633        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3634        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3635        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3636        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3637        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3638 
3639         v   +=  bs2;
3640     }
3641     /* x = inv_diagonal*x */
3642    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3643    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3644    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3645    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3646 
3647   }
3648 
3649   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3650   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3651   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3652   PetscFunctionReturn(0);
3653 }
3654 
3655 #undef __FUNCT__
3656 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3657 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3658 {
3659   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3660   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3661   PetscErrorCode ierr;
3662   PetscInt       *diag = a->diag;
3663   MatScalar      *aa=a->a;
3664   PetscScalar    *x,*b;
3665 
3666   PetscFunctionBegin;
3667   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3668   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3669 
3670   {
3671     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3672     MatScalar  *v,*t=(MatScalar *)x;
3673     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3674 
3675     /* forward solve the lower triangular */
3676     idx  = 0;
3677     t[0] = (MatScalar)b[0];
3678     t[1] = (MatScalar)b[1];
3679     t[2] = (MatScalar)b[2];
3680     t[3] = (MatScalar)b[3];
3681     for (i=1; i<n; i++) {
3682       v     =  aa      + 16*ai[i];
3683       vi    =  aj      + ai[i];
3684       nz    =  diag[i] - ai[i];
3685       idx   +=  4;
3686       s1 = (MatScalar)b[idx];
3687       s2 = (MatScalar)b[1+idx];
3688       s3 = (MatScalar)b[2+idx];
3689       s4 = (MatScalar)b[3+idx];
3690       while (nz--) {
3691         jdx = 4*(*vi++);
3692         x1  = t[jdx];
3693         x2  = t[1+jdx];
3694         x3  = t[2+jdx];
3695         x4  = t[3+jdx];
3696         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3697         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3698         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3699         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3700         v    += 16;
3701       }
3702       t[idx]   = s1;
3703       t[1+idx] = s2;
3704       t[2+idx] = s3;
3705       t[3+idx] = s4;
3706     }
3707     /* backward solve the upper triangular */
3708     idt = 4*(n-1);
3709     for (i=n-1; i>=0; i--){
3710       ai16 = 16*diag[i];
3711       v    = aa + ai16 + 16;
3712       vi   = aj + diag[i] + 1;
3713       nz   = ai[i+1] - diag[i] - 1;
3714       s1   = t[idt];
3715       s2   = t[1+idt];
3716       s3   = t[2+idt];
3717       s4   = t[3+idt];
3718       while (nz--) {
3719         idx = 4*(*vi++);
3720         x1  = (MatScalar)x[idx];
3721         x2  = (MatScalar)x[1+idx];
3722         x3  = (MatScalar)x[2+idx];
3723         x4  = (MatScalar)x[3+idx];
3724         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3725         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3726         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3727         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3728         v    += 16;
3729       }
3730       v        = aa + ai16;
3731       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3732       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3733       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3734       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3735       idt -= 4;
3736     }
3737   }
3738 
3739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3741   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3742   PetscFunctionReturn(0);
3743 }
3744 
3745 #if defined (PETSC_HAVE_SSE)
3746 
3747 #include PETSC_HAVE_SSE
3748 #undef __FUNCT__
3749 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3750 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
3751 {
3752   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3753   unsigned short *aj=(unsigned short *)a->j;
3754   PetscErrorCode ierr;
3755   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3756   MatScalar      *aa=a->a;
3757   PetscScalar    *x,*b;
3758 
3759   PetscFunctionBegin;
3760   SSE_SCOPE_BEGIN;
3761   /*
3762      Note: This code currently uses demotion of double
3763      to float when performing the mixed-mode computation.
3764      This may not be numerically reasonable for all applications.
3765   */
3766   PREFETCH_NTA(aa+16*ai[1]);
3767 
3768   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3769   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3770   {
3771     /* x will first be computed in single precision then promoted inplace to double */
3772     MatScalar      *v,*t=(MatScalar *)x;
3773     int            nz,i,idt,ai16;
3774     unsigned int   jdx,idx;
3775     unsigned short *vi;
3776     /* Forward solve the lower triangular factor. */
3777 
3778     /* First block is the identity. */
3779     idx  = 0;
3780     CONVERT_DOUBLE4_FLOAT4(t,b);
3781     v    =  aa + 16*((unsigned int)ai[1]);
3782 
3783     for (i=1; i<n;) {
3784       PREFETCH_NTA(&v[8]);
3785       vi   =  aj      + ai[i];
3786       nz   =  diag[i] - ai[i];
3787       idx +=  4;
3788 
3789       /* Demote RHS from double to float. */
3790       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3791       LOAD_PS(&t[idx],XMM7);
3792 
3793       while (nz--) {
3794         PREFETCH_NTA(&v[16]);
3795         jdx = 4*((unsigned int)(*vi++));
3796 
3797         /* 4x4 Matrix-Vector product with negative accumulation: */
3798         SSE_INLINE_BEGIN_2(&t[jdx],v)
3799           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3800 
3801           /* First Column */
3802           SSE_COPY_PS(XMM0,XMM6)
3803           SSE_SHUFFLE(XMM0,XMM0,0x00)
3804           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3805           SSE_SUB_PS(XMM7,XMM0)
3806 
3807           /* Second Column */
3808           SSE_COPY_PS(XMM1,XMM6)
3809           SSE_SHUFFLE(XMM1,XMM1,0x55)
3810           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3811           SSE_SUB_PS(XMM7,XMM1)
3812 
3813           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3814 
3815           /* Third Column */
3816           SSE_COPY_PS(XMM2,XMM6)
3817           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3818           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3819           SSE_SUB_PS(XMM7,XMM2)
3820 
3821           /* Fourth Column */
3822           SSE_COPY_PS(XMM3,XMM6)
3823           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3824           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3825           SSE_SUB_PS(XMM7,XMM3)
3826         SSE_INLINE_END_2
3827 
3828         v  += 16;
3829       }
3830       v    =  aa + 16*ai[++i];
3831       PREFETCH_NTA(v);
3832       STORE_PS(&t[idx],XMM7);
3833     }
3834 
3835     /* Backward solve the upper triangular factor.*/
3836 
3837     idt  = 4*(n-1);
3838     ai16 = 16*diag[n-1];
3839     v    = aa + ai16 + 16;
3840     for (i=n-1; i>=0;){
3841       PREFETCH_NTA(&v[8]);
3842       vi = aj + diag[i] + 1;
3843       nz = ai[i+1] - diag[i] - 1;
3844 
3845       LOAD_PS(&t[idt],XMM7);
3846 
3847       while (nz--) {
3848         PREFETCH_NTA(&v[16]);
3849         idx = 4*((unsigned int)(*vi++));
3850 
3851         /* 4x4 Matrix-Vector Product with negative accumulation: */
3852         SSE_INLINE_BEGIN_2(&t[idx],v)
3853           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3854 
3855           /* First Column */
3856           SSE_COPY_PS(XMM0,XMM6)
3857           SSE_SHUFFLE(XMM0,XMM0,0x00)
3858           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3859           SSE_SUB_PS(XMM7,XMM0)
3860 
3861           /* Second Column */
3862           SSE_COPY_PS(XMM1,XMM6)
3863           SSE_SHUFFLE(XMM1,XMM1,0x55)
3864           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3865           SSE_SUB_PS(XMM7,XMM1)
3866 
3867           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3868 
3869           /* Third Column */
3870           SSE_COPY_PS(XMM2,XMM6)
3871           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3872           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3873           SSE_SUB_PS(XMM7,XMM2)
3874 
3875           /* Fourth Column */
3876           SSE_COPY_PS(XMM3,XMM6)
3877           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3878           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3879           SSE_SUB_PS(XMM7,XMM3)
3880         SSE_INLINE_END_2
3881         v  += 16;
3882       }
3883       v    = aa + ai16;
3884       ai16 = 16*diag[--i];
3885       PREFETCH_NTA(aa+ai16+16);
3886       /*
3887          Scale the result by the diagonal 4x4 block,
3888          which was inverted as part of the factorization
3889       */
3890       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3891         /* First Column */
3892         SSE_COPY_PS(XMM0,XMM7)
3893         SSE_SHUFFLE(XMM0,XMM0,0x00)
3894         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3895 
3896         /* Second Column */
3897         SSE_COPY_PS(XMM1,XMM7)
3898         SSE_SHUFFLE(XMM1,XMM1,0x55)
3899         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3900         SSE_ADD_PS(XMM0,XMM1)
3901 
3902         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3903 
3904         /* Third Column */
3905         SSE_COPY_PS(XMM2,XMM7)
3906         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3907         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3908         SSE_ADD_PS(XMM0,XMM2)
3909 
3910         /* Fourth Column */
3911         SSE_COPY_PS(XMM3,XMM7)
3912         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3913         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3914         SSE_ADD_PS(XMM0,XMM3)
3915 
3916         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3917       SSE_INLINE_END_3
3918 
3919       v    = aa + ai16 + 16;
3920       idt -= 4;
3921     }
3922 
3923     /* Convert t from single precision back to double precision (inplace)*/
3924     idt = 4*(n-1);
3925     for (i=n-1;i>=0;i--) {
3926       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3927       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3928       PetscScalar *xtemp=&x[idt];
3929       MatScalar   *ttemp=&t[idt];
3930       xtemp[3] = (PetscScalar)ttemp[3];
3931       xtemp[2] = (PetscScalar)ttemp[2];
3932       xtemp[1] = (PetscScalar)ttemp[1];
3933       xtemp[0] = (PetscScalar)ttemp[0];
3934       idt -= 4;
3935     }
3936 
3937   } /* End of artificial scope. */
3938   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3939   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3940   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3941   SSE_SCOPE_END;
3942   PetscFunctionReturn(0);
3943 }
3944 
3945 #undef __FUNCT__
3946 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3947 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
3948 {
3949   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3950   int            *aj=a->j;
3951   PetscErrorCode ierr;
3952   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3953   MatScalar      *aa=a->a;
3954   PetscScalar    *x,*b;
3955 
3956   PetscFunctionBegin;
3957   SSE_SCOPE_BEGIN;
3958   /*
3959      Note: This code currently uses demotion of double
3960      to float when performing the mixed-mode computation.
3961      This may not be numerically reasonable for all applications.
3962   */
3963   PREFETCH_NTA(aa+16*ai[1]);
3964 
3965   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3966   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3967   {
3968     /* x will first be computed in single precision then promoted inplace to double */
3969     MatScalar *v,*t=(MatScalar *)x;
3970     int       nz,i,idt,ai16;
3971     int       jdx,idx;
3972     int       *vi;
3973     /* Forward solve the lower triangular factor. */
3974 
3975     /* First block is the identity. */
3976     idx  = 0;
3977     CONVERT_DOUBLE4_FLOAT4(t,b);
3978     v    =  aa + 16*ai[1];
3979 
3980     for (i=1; i<n;) {
3981       PREFETCH_NTA(&v[8]);
3982       vi   =  aj      + ai[i];
3983       nz   =  diag[i] - ai[i];
3984       idx +=  4;
3985 
3986       /* Demote RHS from double to float. */
3987       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3988       LOAD_PS(&t[idx],XMM7);
3989 
3990       while (nz--) {
3991         PREFETCH_NTA(&v[16]);
3992         jdx = 4*(*vi++);
3993 /*          jdx = *vi++; */
3994 
3995         /* 4x4 Matrix-Vector product with negative accumulation: */
3996         SSE_INLINE_BEGIN_2(&t[jdx],v)
3997           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3998 
3999           /* First Column */
4000           SSE_COPY_PS(XMM0,XMM6)
4001           SSE_SHUFFLE(XMM0,XMM0,0x00)
4002           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4003           SSE_SUB_PS(XMM7,XMM0)
4004 
4005           /* Second Column */
4006           SSE_COPY_PS(XMM1,XMM6)
4007           SSE_SHUFFLE(XMM1,XMM1,0x55)
4008           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4009           SSE_SUB_PS(XMM7,XMM1)
4010 
4011           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4012 
4013           /* Third Column */
4014           SSE_COPY_PS(XMM2,XMM6)
4015           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4016           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4017           SSE_SUB_PS(XMM7,XMM2)
4018 
4019           /* Fourth Column */
4020           SSE_COPY_PS(XMM3,XMM6)
4021           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4022           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4023           SSE_SUB_PS(XMM7,XMM3)
4024         SSE_INLINE_END_2
4025 
4026         v  += 16;
4027       }
4028       v    =  aa + 16*ai[++i];
4029       PREFETCH_NTA(v);
4030       STORE_PS(&t[idx],XMM7);
4031     }
4032 
4033     /* Backward solve the upper triangular factor.*/
4034 
4035     idt  = 4*(n-1);
4036     ai16 = 16*diag[n-1];
4037     v    = aa + ai16 + 16;
4038     for (i=n-1; i>=0;){
4039       PREFETCH_NTA(&v[8]);
4040       vi = aj + diag[i] + 1;
4041       nz = ai[i+1] - diag[i] - 1;
4042 
4043       LOAD_PS(&t[idt],XMM7);
4044 
4045       while (nz--) {
4046         PREFETCH_NTA(&v[16]);
4047         idx = 4*(*vi++);
4048 /*          idx = *vi++; */
4049 
4050         /* 4x4 Matrix-Vector Product with negative accumulation: */
4051         SSE_INLINE_BEGIN_2(&t[idx],v)
4052           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4053 
4054           /* First Column */
4055           SSE_COPY_PS(XMM0,XMM6)
4056           SSE_SHUFFLE(XMM0,XMM0,0x00)
4057           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4058           SSE_SUB_PS(XMM7,XMM0)
4059 
4060           /* Second Column */
4061           SSE_COPY_PS(XMM1,XMM6)
4062           SSE_SHUFFLE(XMM1,XMM1,0x55)
4063           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4064           SSE_SUB_PS(XMM7,XMM1)
4065 
4066           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4067 
4068           /* Third Column */
4069           SSE_COPY_PS(XMM2,XMM6)
4070           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4071           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4072           SSE_SUB_PS(XMM7,XMM2)
4073 
4074           /* Fourth Column */
4075           SSE_COPY_PS(XMM3,XMM6)
4076           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4077           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4078           SSE_SUB_PS(XMM7,XMM3)
4079         SSE_INLINE_END_2
4080         v  += 16;
4081       }
4082       v    = aa + ai16;
4083       ai16 = 16*diag[--i];
4084       PREFETCH_NTA(aa+ai16+16);
4085       /*
4086          Scale the result by the diagonal 4x4 block,
4087          which was inverted as part of the factorization
4088       */
4089       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4090         /* First Column */
4091         SSE_COPY_PS(XMM0,XMM7)
4092         SSE_SHUFFLE(XMM0,XMM0,0x00)
4093         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4094 
4095         /* Second Column */
4096         SSE_COPY_PS(XMM1,XMM7)
4097         SSE_SHUFFLE(XMM1,XMM1,0x55)
4098         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4099         SSE_ADD_PS(XMM0,XMM1)
4100 
4101         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4102 
4103         /* Third Column */
4104         SSE_COPY_PS(XMM2,XMM7)
4105         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4106         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4107         SSE_ADD_PS(XMM0,XMM2)
4108 
4109         /* Fourth Column */
4110         SSE_COPY_PS(XMM3,XMM7)
4111         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4112         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4113         SSE_ADD_PS(XMM0,XMM3)
4114 
4115         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4116       SSE_INLINE_END_3
4117 
4118       v    = aa + ai16 + 16;
4119       idt -= 4;
4120     }
4121 
4122     /* Convert t from single precision back to double precision (inplace)*/
4123     idt = 4*(n-1);
4124     for (i=n-1;i>=0;i--) {
4125       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4126       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4127       PetscScalar *xtemp=&x[idt];
4128       MatScalar   *ttemp=&t[idt];
4129       xtemp[3] = (PetscScalar)ttemp[3];
4130       xtemp[2] = (PetscScalar)ttemp[2];
4131       xtemp[1] = (PetscScalar)ttemp[1];
4132       xtemp[0] = (PetscScalar)ttemp[0];
4133       idt -= 4;
4134     }
4135 
4136   } /* End of artificial scope. */
4137   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4138   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4139   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4140   SSE_SCOPE_END;
4141   PetscFunctionReturn(0);
4142 }
4143 
4144 #endif
4145 
4146 #undef __FUNCT__
4147 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4148 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4149 {
4150   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4151   IS                iscol=a->col,isrow=a->row;
4152   PetscErrorCode    ierr;
4153   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4154   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4155   const MatScalar   *aa=a->a,*v;
4156   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4157   const PetscScalar *b;
4158 
4159   PetscFunctionBegin;
4160   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4161   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4162   t  = a->solve_work;
4163 
4164   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4165   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4166 
4167   /* forward solve the lower triangular */
4168   idx    = 3*(*r++);
4169   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4170   for (i=1; i<n; i++) {
4171     v     = aa + 9*ai[i];
4172     vi    = aj + ai[i];
4173     nz    = diag[i] - ai[i];
4174     idx   = 3*(*r++);
4175     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4176     while (nz--) {
4177       idx   = 3*(*vi++);
4178       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4179       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4180       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4181       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4182       v += 9;
4183     }
4184     idx = 3*i;
4185     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4186   }
4187   /* backward solve the upper triangular */
4188   for (i=n-1; i>=0; i--){
4189     v    = aa + 9*diag[i] + 9;
4190     vi   = aj + diag[i] + 1;
4191     nz   = ai[i+1] - diag[i] - 1;
4192     idt  = 3*i;
4193     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4194     while (nz--) {
4195       idx   = 3*(*vi++);
4196       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4197       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4198       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4199       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4200       v += 9;
4201     }
4202     idc = 3*(*c--);
4203     v   = aa + 9*diag[i];
4204     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4205     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4206     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4207   }
4208   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4209   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4210   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4211   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4212   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4213   PetscFunctionReturn(0);
4214 }
4215 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
/*
   Earlier variant of MatSolve_SeqBAIJ_3_newdatastruct, compiled only when
   OLD_ROUTINE_TO_BE_REPLACED is defined.  It finds the upper-triangular part
   of block row i through a->i[2*n-i] rather than through a->diag.

   Input Parameters:
+  A  - the factored matrix (3x3 blocks)
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   NOTE(review): a function of the same name is defined unconditionally
   further down in this file, so enabling the macro would presumably lead to
   a duplicate definition -- confirm before turning it on.
*/
PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colperm = fact->col,rowperm = fact->row;
  PetscErrorCode    ierr;
  PetscInt          row,urow,cnt,src,dst,off,out;
  PetscInt          nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*cols;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,*work,s1,s2,s3,x1,x2,x3;
  const PetscScalar *rhs;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowperm,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colperm,&cperm);CHKERRQ(ierr);

  /* Forward sweep over the lower triangular rows */
  src     = 3*rperm[0];
  work[0] = rhs[src];
  work[1] = rhs[1+src];
  work[2] = rhs[2+src];
  for (row=1; row<nblk; row++) {
    blk  = vals + 9*rowptr[row];
    cols = colidx + rowptr[row];
    src  = 3*rperm[row];
    s1   = rhs[src];
    s2   = rhs[1+src];
    s3   = rhs[2+src];
    for (cnt=rowptr[row+1]-rowptr[row]; cnt>0; cnt--) {
      off = 3*(*cols++);
      x1  = work[off];
      x2  = work[1+off];
      x3  = work[2+off];
      s1 -= blk[0]*x1 + blk[3]*x2 + blk[6]*x3;
      s2 -= blk[1]*x1 + blk[4]*x2 + blk[7]*x3;
      s3 -= blk[2]*x1 + blk[5]*x2 + blk[8]*x3;
      blk += 9;
    }
    dst         = 3*row;
    work[dst]   = s1;
    work[1+dst] = s2;
    work[2+dst] = s3;
  }

  /* Backward sweep: the upper part of block row `row` sits at position 2*n-row of a->i */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;
    blk  = vals + 9*rowptr[urow];
    cols = colidx + rowptr[urow];
    dst  = 3*row;
    s1   = work[dst];
    s2   = work[1+dst];
    s3   = work[2+dst];
    /* all entries of this row except the last one are off-diagonal blocks */
    for (cnt=rowptr[urow+1]-rowptr[urow]-1; cnt>0; cnt--) {
      off = 3*(*cols++);
      x1  = work[off];
      x2  = work[1+off];
      x3  = work[2+off];
      s1 -= blk[0]*x1 + blk[3]*x2 + blk[6]*x3;
      s2 -= blk[1]*x1 + blk[4]*x2 + blk[7]*x3;
      s3 -= blk[2]*x1 + blk[5]*x2 + blk[8]*x3;
      blk += 9;
    }
    /* blk now addresses the last block of the row, which is applied to the sums */
    out        = 3*cperm[row];
    sol[out]   = work[dst]   = blk[0]*s1 + blk[3]*s2 + blk[6]*s3;
    sol[1+out] = work[1+dst] = blk[1]*s1 + blk[4]*s2 + blk[7]*s3;
    sol[2+out] = work[2+dst] = blk[2]*s1 + blk[5]*s2 + blk[8]*s3;
  }

  ierr = ISRestoreIndices(rowperm,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colperm,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9*(fact->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
4287 
4288 #undef __FUNCT__
4289 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
4290 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
4291 {
4292   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4293   IS                iscol=a->col,isrow=a->row;
4294   PetscErrorCode    ierr;
4295   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4296   const PetscInt    *r,*c,*rout,*cout;
4297   const MatScalar   *aa=a->a,*v;
4298   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4299   const PetscScalar *b;
4300 
4301   PetscFunctionBegin;
4302   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4303   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4304   t  = a->solve_work;
4305 
4306   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4307   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4308 
4309   /* forward solve the lower triangular */
4310   idx    = 3*r[0];
4311   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4312   for (i=1; i<n; i++) {
4313     v     = aa + 9*ai[i];
4314     vi    = aj + ai[i];
4315     nz    = ai[i+1] - ai[i];
4316     idx   = 3*r[i];
4317     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4318     for(m=0;m<nz;m++){
4319       idx   = 3*vi[m];
4320       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4321       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4322       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4323       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4324       v += 9;
4325     }
4326     idx = 3*i;
4327     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4328   }
4329   /* backward solve the upper triangular */
4330   for (i=n-1; i>=0; i--){
4331     v    = aa + 9*(adiag[i+1]+1);
4332     vi   = aj + adiag[i+1]+1;
4333     nz   = adiag[i] - adiag[i+1] - 1;
4334     idt  = 3*i;
4335     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4336     for(m=0;m<nz;m++){
4337       idx   = 3*vi[m];
4338       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4339       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4340       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4341       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4342       v += 9;
4343     }
4344     idc = 3*c[i];
4345     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4346     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4347     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4348   }
4349   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4350   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4351   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4352   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4353   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4354   PetscFunctionReturn(0);
4355 }
4356 
4357 /*
4358       Special case where the matrix was ILU(0) factored in the natural
4359    ordering. This eliminates the need for the column and row permutation.
4360 */
4361 #undef __FUNCT__
4362 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4363 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4364 {
4365   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4366   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4367   PetscErrorCode    ierr;
4368   PetscInt          *diag = a->diag;
4369   const MatScalar   *aa=a->a,*v;
4370   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4371   const PetscScalar *b;
4372   PetscInt          jdx,idt,idx,nz,*vi,i;
4373 
4374   PetscFunctionBegin;
4375   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4376   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4377 
4378   /* forward solve the lower triangular */
4379   idx    = 0;
4380   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4381   for (i=1; i<n; i++) {
4382     v     =  aa      + 9*ai[i];
4383     vi    =  aj      + ai[i];
4384     nz    =  diag[i] - ai[i];
4385     idx   +=  3;
4386     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4387     while (nz--) {
4388       jdx   = 3*(*vi++);
4389       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4390       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4391       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4392       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4393       v    += 9;
4394     }
4395     x[idx]   = s1;
4396     x[1+idx] = s2;
4397     x[2+idx] = s3;
4398   }
4399   /* backward solve the upper triangular */
4400   for (i=n-1; i>=0; i--){
4401     v    = aa + 9*diag[i] + 9;
4402     vi   = aj + diag[i] + 1;
4403     nz   = ai[i+1] - diag[i] - 1;
4404     idt  = 3*i;
4405     s1 = x[idt];  s2 = x[1+idt];
4406     s3 = x[2+idt];
4407     while (nz--) {
4408       idx   = 3*(*vi++);
4409       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4410       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4411       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4412       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4413       v    += 9;
4414     }
4415     v        = aa +  9*diag[i];
4416     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4417     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4418     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4419   }
4420 
4421   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4422   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4423   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4424   PetscFunctionReturn(0);
4425 }
4426 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
/*
   Earlier variant of MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct,
   compiled only when OLD_ROUTINE_TO_BE_REPLACED is defined.  It finds the
   upper-triangular part of block row i through a->i[2*n-i] rather than
   through a->diag, and applies no permutation.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   NOTE(review): offsets are scaled by the matrix's bs and bs2 while the
   arithmetic is unrolled for three components, so this presumes bs == 3.
   NOTE(review): a function of the same name is defined unconditionally
   further down in this file, so enabling the macro would presumably lead to
   a duplicate definition -- confirm before turning it on.
*/
PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*cols;
  PetscInt          bs = A->rmap->bs,bs2 = fact->bs2;
  PetscInt          row,urow,cnt,dst,off;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,s1,s2,s3,x1,x2,x3;
  const PetscScalar *rhs;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);

  /* Forward sweep: the first block row is a plain copy of the right-hand side */
  sol[0] = rhs[0];
  sol[1] = rhs[1];
  sol[2] = rhs[2];
  for (row=1; row<nblk; row++) {
    blk  = vals + bs2*rowptr[row];
    cols = colidx + rowptr[row];
    dst  = bs*row;
    s1   = rhs[dst];
    s2   = rhs[1+dst];
    s3   = rhs[2+dst];
    for (cnt=rowptr[row+1]-rowptr[row]; cnt>0; cnt--) {
      off = bs*(*cols++);
      x1  = sol[off];
      x2  = sol[1+off];
      x3  = sol[2+off];
      s1 -= blk[0]*x1 + blk[3]*x2 + blk[6]*x3;
      s2 -= blk[1]*x1 + blk[4]*x2 + blk[7]*x3;
      s3 -= blk[2]*x1 + blk[5]*x2 + blk[8]*x3;
      blk += bs2;
    }
    sol[dst]   = s1;
    sol[1+dst] = s2;
    sol[2+dst] = s3;
  }

  /* Backward sweep: the upper part of block row `row` sits at position 2*n-row of a->i */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;
    blk  = vals + bs2*rowptr[urow];
    cols = colidx + rowptr[urow];
    dst  = bs*row;
    s1   = sol[dst];
    s2   = sol[1+dst];
    s3   = sol[2+dst];
    /* all entries of this row except the last one are off-diagonal blocks */
    for (cnt=rowptr[urow+1]-rowptr[urow]-1; cnt>0; cnt--) {
      off = bs*(*cols++);
      x1  = sol[off];
      x2  = sol[1+off];
      x3  = sol[2+off];
      s1 -= blk[0]*x1 + blk[3]*x2 + blk[6]*x3;
      s2 -= blk[1]*x1 + blk[4]*x2 + blk[7]*x3;
      s3 -= blk[2]*x1 + blk[5]*x2 + blk[8]*x3;
      blk += bs2;
    }
    /* x = inv_diagonal*x; blk now addresses the last block of the row */
    sol[dst]   = blk[0]*s1 + blk[3]*s2 + blk[6]*s3;
    sol[1+dst] = blk[1]*s1 + blk[4]*s2 + blk[7]*s3;
    sol[2+dst] = blk[2]*s1 + blk[5]*s2 + blk[8]*s3;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(fact->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
4499 
4500 #undef __FUNCT__
4501 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4502 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4503 {
4504     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4505     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4506     PetscErrorCode    ierr;
4507     PetscInt          idx,jdx,idt;
4508     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4509     const MatScalar   *aa=a->a,*v;
4510     PetscScalar       *x;
4511     const PetscScalar *b;
4512     PetscScalar        s1,s2,s3,x1,x2,x3;
4513 
4514     PetscFunctionBegin;
4515     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4516     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4517     /* forward solve the lower triangular */
4518     idx    = 0;
4519     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4520     for (i=1; i<n; i++) {
4521        v    = aa + bs2*ai[i];
4522        vi   = aj + ai[i];
4523        nz   = ai[i+1] - ai[i];
4524       idx   = bs*i;
4525        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4526       for(k=0;k<nz;k++){
4527          jdx   = bs*vi[k];
4528           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4529           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4530           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4531           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4532 
4533           v   +=  bs2;
4534         }
4535 
4536        x[idx]   = s1;
4537        x[1+idx] = s2;
4538        x[2+idx] = s3;
4539     }
4540 
4541    /* backward solve the upper triangular */
4542   for (i=n-1; i>=0; i--){
4543     v   = aa + bs2*(adiag[i+1]+1);
4544      vi  = aj + adiag[i+1]+1;
4545      nz  = adiag[i] - adiag[i+1]-1;
4546      idt = bs*i;
4547      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4548 
4549      for(k=0;k<nz;k++){
4550        idx   = bs*vi[k];
4551        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4552        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4553        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4554        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4555 
4556         v   +=  bs2;
4557     }
4558     /* x = inv_diagonal*x */
4559    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4560    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4561    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4562 
4563   }
4564 
4565   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4566   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4567   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4568   PetscFunctionReturn(0);
4569 }
4570 
4571 #undef __FUNCT__
4572 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4573 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4574 {
4575   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4576   IS                iscol=a->col,isrow=a->row;
4577   PetscErrorCode    ierr;
4578   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4579   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4580   const MatScalar   *aa=a->a,*v;
4581   PetscScalar       *x,s1,s2,x1,x2,*t;
4582   const PetscScalar *b;
4583 
4584   PetscFunctionBegin;
4585   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4586   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4587   t  = a->solve_work;
4588 
4589   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4590   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4591 
4592   /* forward solve the lower triangular */
4593   idx    = 2*(*r++);
4594   t[0] = b[idx]; t[1] = b[1+idx];
4595   for (i=1; i<n; i++) {
4596     v     = aa + 4*ai[i];
4597     vi    = aj + ai[i];
4598     nz    = diag[i] - ai[i];
4599     idx   = 2*(*r++);
4600     s1  = b[idx]; s2 = b[1+idx];
4601     while (nz--) {
4602       idx   = 2*(*vi++);
4603       x1    = t[idx]; x2 = t[1+idx];
4604       s1 -= v[0]*x1 + v[2]*x2;
4605       s2 -= v[1]*x1 + v[3]*x2;
4606       v += 4;
4607     }
4608     idx = 2*i;
4609     t[idx] = s1; t[1+idx] = s2;
4610   }
4611   /* backward solve the upper triangular */
4612   for (i=n-1; i>=0; i--){
4613     v    = aa + 4*diag[i] + 4;
4614     vi   = aj + diag[i] + 1;
4615     nz   = ai[i+1] - diag[i] - 1;
4616     idt  = 2*i;
4617     s1 = t[idt]; s2 = t[1+idt];
4618     while (nz--) {
4619       idx   = 2*(*vi++);
4620       x1    = t[idx]; x2 = t[1+idx];
4621       s1 -= v[0]*x1 + v[2]*x2;
4622       s2 -= v[1]*x1 + v[3]*x2;
4623       v += 4;
4624     }
4625     idc = 2*(*c--);
4626     v   = aa + 4*diag[i];
4627     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4628     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4629   }
4630   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4631   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4632   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4633   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4634   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4635   PetscFunctionReturn(0);
4636 }
4637 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
/*
   Earlier variant of MatSolve_SeqBAIJ_2_newdatastruct, compiled only when
   OLD_ROUTINE_TO_BE_REPLACED is defined.  It finds the upper-triangular part
   of block row i through a->i[2*n-i] rather than through a->diag.

   Input Parameters:
+  A  - the factored matrix (2x2 blocks)
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   NOTE(review): a function of the same name is defined unconditionally
   further down in this file, so enabling the macro would presumably lead to
   a duplicate definition -- confirm before turning it on.
*/
PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colperm = fact->col,rowperm = fact->row;
  PetscErrorCode    ierr;
  PetscInt          row,urow,cnt,src,dst,off,out;
  PetscInt          nblk = fact->mbs,*rowptr = fact->i,*colidx = fact->j,*cols;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,*work,s1,s2,x1,x2;
  const PetscScalar *rhs;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowperm,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colperm,&cperm);CHKERRQ(ierr);

  /* Forward sweep over the lower triangular rows */
  src     = 2*rperm[0];
  work[0] = rhs[src];
  work[1] = rhs[1+src];
  for (row=1; row<nblk; row++) {
    blk  = vals + 4*rowptr[row];
    cols = colidx + rowptr[row];
    src  = 2*rperm[row];
    s1   = rhs[src];
    s2   = rhs[1+src];
    for (cnt=rowptr[row+1]-rowptr[row]; cnt>0; cnt--) {
      off = 2*(*cols++);
      x1  = work[off];
      x2  = work[1+off];
      s1 -= blk[0]*x1 + blk[2]*x2;
      s2 -= blk[1]*x1 + blk[3]*x2;
      blk += 4;
    }
    dst         = 2*row;
    work[dst]   = s1;
    work[1+dst] = s2;
  }

  /* Backward sweep: the upper part of block row `row` sits at position 2*n-row of a->i */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;
    blk  = vals + 4*rowptr[urow];
    cols = colidx + rowptr[urow];
    dst  = 2*row;
    s1   = work[dst];
    s2   = work[1+dst];
    /* all entries of this row except the last one are off-diagonal blocks */
    for (cnt=rowptr[urow+1]-rowptr[urow]-1; cnt>0; cnt--) {
      off = 2*(*cols++);
      x1  = work[off];
      x2  = work[1+off];
      s1 -= blk[0]*x1 + blk[2]*x2;
      s2 -= blk[1]*x1 + blk[3]*x2;
      blk += 4;
    }
    /* blk now addresses the last block of the row, which is applied to the sums */
    out        = 2*cperm[row];
    sol[out]   = work[dst]   = blk[0]*s1 + blk[2]*s2;
    sol[1+out] = work[1+dst] = blk[1]*s1 + blk[3]*s2;
  }

  ierr = ISRestoreIndices(rowperm,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colperm,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(fact->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
4706 
4707 #undef __FUNCT__
4708 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4709 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
4710 {
4711   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4712   IS                iscol=a->col,isrow=a->row;
4713   PetscErrorCode    ierr;
4714   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4715   const PetscInt    *r,*c,*rout,*cout;
4716   const MatScalar   *aa=a->a,*v;
4717   PetscScalar       *x,s1,s2,x1,x2,*t;
4718   const PetscScalar *b;
4719 
4720   PetscFunctionBegin;
4721   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4722   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4723   t  = a->solve_work;
4724 
4725   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4726   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4727 
4728   /* forward solve the lower triangular */
4729   idx    = 2*r[0];
4730   t[0] = b[idx]; t[1] = b[1+idx];
4731   for (i=1; i<n; i++) {
4732     v     = aa + 4*ai[i];
4733     vi    = aj + ai[i];
4734     nz    = ai[i+1] - ai[i];
4735     idx   = 2*r[i];
4736     s1  = b[idx]; s2 = b[1+idx];
4737     for(m=0;m<nz;m++){
4738       jdx   = 2*vi[m];
4739       x1    = t[jdx]; x2 = t[1+jdx];
4740       s1 -= v[0]*x1 + v[2]*x2;
4741       s2 -= v[1]*x1 + v[3]*x2;
4742       v += 4;
4743     }
4744     idx = 2*i;
4745     t[idx] = s1; t[1+idx] = s2;
4746   }
4747   /* backward solve the upper triangular */
4748   for (i=n-1; i>=0; i--){
4749     v    = aa + 4*(adiag[i+1]+1);
4750     vi   = aj + adiag[i+1]+1;
4751     nz   = adiag[i] - adiag[i+1] - 1;
4752     idt  = 2*i;
4753     s1 = t[idt]; s2 = t[1+idt];
4754     for(m=0;m<nz;m++){
4755       idx   = 2*vi[m];
4756       x1    = t[idx]; x2 = t[1+idx];
4757       s1 -= v[0]*x1 + v[2]*x2;
4758       s2 -= v[1]*x1 + v[3]*x2;
4759       v += 4;
4760     }
4761     idc = 2*c[i];
4762     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4763     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4764   }
4765   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4766   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4767   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4769   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4770   PetscFunctionReturn(0);
4771 }
4772 
4773 /*
4774       Special case where the matrix was ILU(0) factored in the natural
4775    ordering. This eliminates the need for the column and row permutation.
4776 */
4777 #undef __FUNCT__
4778 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4779 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
4780 {
4781   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4782   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4783   PetscErrorCode    ierr;
4784   PetscInt          *diag = a->diag;
4785   const MatScalar   *aa=a->a,*v;
4786   PetscScalar       *x,s1,s2,x1,x2;
4787   const PetscScalar *b;
4788   PetscInt          jdx,idt,idx,nz,*vi,i;
4789 
4790   PetscFunctionBegin;
4791   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4792   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4793 
4794   /* forward solve the lower triangular */
4795   idx    = 0;
4796   x[0]   = b[0]; x[1] = b[1];
4797   for (i=1; i<n; i++) {
4798     v     =  aa      + 4*ai[i];
4799     vi    =  aj      + ai[i];
4800     nz    =  diag[i] - ai[i];
4801     idx   +=  2;
4802     s1  =  b[idx];s2 = b[1+idx];
4803     while (nz--) {
4804       jdx   = 2*(*vi++);
4805       x1    = x[jdx];x2 = x[1+jdx];
4806       s1 -= v[0]*x1 + v[2]*x2;
4807       s2 -= v[1]*x1 + v[3]*x2;
4808       v    += 4;
4809     }
4810     x[idx]   = s1;
4811     x[1+idx] = s2;
4812   }
4813   /* backward solve the upper triangular */
4814   for (i=n-1; i>=0; i--){
4815     v    = aa + 4*diag[i] + 4;
4816     vi   = aj + diag[i] + 1;
4817     nz   = ai[i+1] - diag[i] - 1;
4818     idt  = 2*i;
4819     s1 = x[idt];  s2 = x[1+idt];
4820     while (nz--) {
4821       idx   = 2*(*vi++);
4822       x1    = x[idx];   x2 = x[1+idx];
4823       s1 -= v[0]*x1 + v[2]*x2;
4824       s2 -= v[1]*x1 + v[3]*x2;
4825       v    += 4;
4826     }
4827     v        = aa +  4*diag[i];
4828     x[idt]   = v[0]*s1 + v[2]*s2;
4829     x[1+idt] = v[1]*s1 + v[3]*s2;
4830   }
4831 
4832   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4833   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4834   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4835   PetscFunctionReturn(0);
4836 }
4837 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
/*
   Superseded variant of the block size 2 natural-ordering solve; it is only
   compiled when OLD_ROUTINE_TO_BE_REPLACED is defined.

   The lower part of block row i is read from positions ai[i] .. ai[i+1]-1.
   The upper part of block row i is located through ai[2*n-i]; the last
   block of that row is the one applied as the diagonal.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
  PetscInt          i,k,nz,*vi,idx,idt,jdx;
  const MatScalar   *aa=a->a,*v;
  const PetscScalar *b;
  PetscScalar       *x,s1,s2,x1,x2;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx];
  x[1] = b[1+idx];
  for (i=1; i<n; i++) {
    v   = aa + 4*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 2*i;
    s1  = b[idx];
    s2  = b[1+idx];
    for (k=0; k<nz; k++) {
      jdx = 2*vi[k];
      x1  = x[jdx];
      x2  = x[1+jdx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 4*ai[2*n-i];
    vi  = aj + ai[2*n-i];
    nz  = ai[2*n-i +1] - ai[2*n-i]-1;
    idt = 2*i;
    s1  = x[idt];
    s2  = x[1+idt];
    for (k=0; k<nz; k++) {
      idx = 2*vi[k];
      x1  = x[idx];
      x2  = x[1+idx];
      s1 -= v[0]*x1 + v[2]*x2;
      s2 -= v[1]*x1 + v[3]*x2;
      v  += 4;
    }
    /* v now addresses the last block of the row: x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[2]*s2;
    x[1+idt] = v[1]*s1 + v[3]*s2;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
4899 
4900 #undef __FUNCT__
4901 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4902 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4903 {
4904     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4905     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4906     PetscErrorCode    ierr;
4907     PetscInt          jdx;
4908     const MatScalar   *aa=a->a,*v;
4909     PetscScalar       *x,s1,s2,x1,x2;
4910     const PetscScalar *b;
4911 
4912     PetscFunctionBegin;
4913     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4914     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4915     /* forward solve the lower triangular */
4916     idx    = 0;
4917     x[0] = b[idx]; x[1] = b[1+idx];
4918     for (i=1; i<n; i++) {
4919         v   = aa + 4*ai[i];
4920        vi   = aj + ai[i];
4921        nz   = ai[i+1] - ai[i];
4922        idx  = 2*i;
4923        s1   = b[idx];s2 = b[1+idx];
4924       for(k=0;k<nz;k++){
4925          jdx   = 2*vi[k];
4926           x1    = x[jdx];x2 = x[1+jdx];
4927           s1   -= v[0]*x1 + v[2]*x2;
4928           s2   -= v[1]*x1 + v[3]*x2;
4929            v   +=  4;
4930         }
4931        x[idx]   = s1;
4932        x[1+idx] = s2;
4933     }
4934 
4935    /* backward solve the upper triangular */
4936   for (i=n-1; i>=0; i--){
4937      v   = aa + 4*(adiag[i+1]+1);
4938      vi  = aj + adiag[i+1]+1;
4939      nz  = adiag[i] - adiag[i+1]-1;
4940      idt = 2*i;
4941      s1 = x[idt];  s2 = x[1+idt];
4942      for(k=0;k<nz;k++){
4943       idx   = 2*vi[k];
4944        x1    = x[idx];   x2 = x[1+idx];
4945        s1 -= v[0]*x1 + v[2]*x2;
4946        s2 -= v[1]*x1 + v[3]*x2;
4947          v    += 4;
4948     }
4949     /* x = inv_diagonal*x */
4950    x[idt]   = v[0]*s1 + v[2]*s2;
4951    x[1+idt] = v[1]*s1 + v[3]*s2;
4952   }
4953 
4954   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4955   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4956   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4957   PetscFunctionReturn(0);
4958 }
4959 
4960 #undef __FUNCT__
4961 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4962 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4963 {
4964   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4965   IS             iscol=a->col,isrow=a->row;
4966   PetscErrorCode ierr;
4967   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4968   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4969   MatScalar      *aa=a->a,*v;
4970   PetscScalar    *x,*b,s1,*t;
4971 
4972   PetscFunctionBegin;
4973   if (!n) PetscFunctionReturn(0);
4974 
4975   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4976   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4977   t  = a->solve_work;
4978 
4979   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4980   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4981 
4982   /* forward solve the lower triangular */
4983   t[0] = b[*r++];
4984   for (i=1; i<n; i++) {
4985     v     = aa + ai[i];
4986     vi    = aj + ai[i];
4987     nz    = diag[i] - ai[i];
4988     s1  = b[*r++];
4989     while (nz--) {
4990       s1 -= (*v++)*t[*vi++];
4991     }
4992     t[i] = s1;
4993   }
4994   /* backward solve the upper triangular */
4995   for (i=n-1; i>=0; i--){
4996     v    = aa + diag[i] + 1;
4997     vi   = aj + diag[i] + 1;
4998     nz   = ai[i+1] - diag[i] - 1;
4999     s1 = t[i];
5000     while (nz--) {
5001       s1 -= (*v++)*t[*vi++];
5002     }
5003     x[*c--] = t[i] = aa[diag[i]]*s1;
5004   }
5005 
5006   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5007   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5008   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5009   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5010   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5011   PetscFunctionReturn(0);
5012 }
5013 /*
5014       Special case where the matrix was ILU(0) factored in the natural
5015    ordering. This eliminates the need for the column and row permutation.
5016 */
5017 #undef __FUNCT__
5018 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5019 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5020 {
5021   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5022   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
5023   PetscErrorCode ierr;
5024   PetscInt       *diag = a->diag;
5025   MatScalar      *aa=a->a;
5026   PetscScalar    *x,*b;
5027   PetscScalar    s1,x1;
5028   MatScalar      *v;
5029   PetscInt       jdx,idt,idx,nz,*vi,i;
5030 
5031   PetscFunctionBegin;
5032   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5033   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5034 
5035   /* forward solve the lower triangular */
5036   idx    = 0;
5037   x[0]   = b[0];
5038   for (i=1; i<n; i++) {
5039     v     =  aa      + ai[i];
5040     vi    =  aj      + ai[i];
5041     nz    =  diag[i] - ai[i];
5042     idx   +=  1;
5043     s1  =  b[idx];
5044     while (nz--) {
5045       jdx   = *vi++;
5046       x1    = x[jdx];
5047       s1 -= v[0]*x1;
5048       v    += 1;
5049     }
5050     x[idx]   = s1;
5051   }
5052   /* backward solve the upper triangular */
5053   for (i=n-1; i>=0; i--){
5054     v    = aa + diag[i] + 1;
5055     vi   = aj + diag[i] + 1;
5056     nz   = ai[i+1] - diag[i] - 1;
5057     idt  = i;
5058     s1 = x[idt];
5059     while (nz--) {
5060       idx   = *vi++;
5061       x1    = x[idx];
5062       s1 -= v[0]*x1;
5063       v    += 1;
5064     }
5065     v        = aa +  diag[i];
5066     x[idt]   = v[0]*s1;
5067   }
5068   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5069   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5070   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5071   PetscFunctionReturn(0);
5072 }
5073 
5074 /* ----------------------------------------------------------------*/
5075 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5076 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
5077 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
5078 
5079 #undef __FUNCT__
5080 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
5081 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
5082 {
5083   Mat            C=B;
5084   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5085   IS             isrow = b->row,isicol = b->icol;
5086   PetscErrorCode ierr;
5087   const PetscInt *r,*ic,*ics;
5088   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5089   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5090   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5091   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5092   MatScalar      *v_work;
5093   PetscTruth     col_identity,row_identity,both_identity;
5094 
5095   PetscFunctionBegin;
5096   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5097   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5098 
5099   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5100   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5101   ics  = ic;
5102 
5103   /* generate work space needed by dense LU factorization */
5104   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5105 
5106   for (i=0; i<n; i++){
5107     /* zero rtmp */
5108     /* L part */
5109     nz    = bi[i+1] - bi[i];
5110     bjtmp = bj + bi[i];
5111     for  (j=0; j<nz; j++){
5112       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5113     }
5114 
5115     /* U part */
5116     nz = bdiag[i] - bdiag[i+1];
5117     bjtmp = bj + bdiag[i+1]+1;
5118     for  (j=0; j<nz; j++){
5119       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5120     }
5121 
5122     /* load in initial (unfactored row) */
5123     nz    = ai[r[i]+1] - ai[r[i]];
5124     ajtmp = aj + ai[r[i]];
5125     v     = aa + bs2*ai[r[i]];
5126     for (j=0; j<nz; j++) {
5127       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5128     }
5129 
5130     /* elimination */
5131     bjtmp = bj + bi[i];
5132     nzL   = bi[i+1] - bi[i];
5133     for(k=0;k < nzL;k++) {
5134       row = bjtmp[k];
5135       pc = rtmp + bs2*row;
5136       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5137       if (flg) {
5138         pv         = b->a + bs2*bdiag[row];
5139         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5140         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5141         pv         = b->a + bs2*(bdiag[row+1]+1);
5142         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5143         for (j=0; j<nz; j++) {
5144           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5145         }
5146         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5147       }
5148     }
5149 
5150     /* finished row so stick it into b->a */
5151     /* L part */
5152     pv   = b->a + bs2*bi[i] ;
5153     pj   = b->j + bi[i] ;
5154     nz   = bi[i+1] - bi[i];
5155     for (j=0; j<nz; j++) {
5156       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5157     }
5158 
5159     /* Mark diagonal and invert diagonal for simplier triangular solves */
5160     pv  = b->a + bs2*bdiag[i];
5161     pj  = b->j + bdiag[i];
5162     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5163     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5164     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5165 
5166     /* U part */
5167     pv = b->a + bs2*(bdiag[i+1]+1);
5168     pj = b->j + bdiag[i+1]+1;
5169     nz = bdiag[i] - bdiag[i+1] - 1;
5170     for (j=0; j<nz; j++){
5171       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5172     }
5173   }
5174 
5175   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5176   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5177   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5178   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5179 
5180   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5181   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5182   both_identity = (PetscTruth) (row_identity && col_identity);
5183   if (both_identity){
5184     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
5185   } else {
5186     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
5187   }
5188 
5189   C->assembled = PETSC_TRUE;
5190   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5191   PetscFunctionReturn(0);
5192 }
5193 
5194 /*
5195    ilu(0) with natural ordering under new data structure.
5196    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
5197    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
5198 */
5199 
5200 #undef __FUNCT__
5201 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
5202 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5203 {
5204 
5205   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5206   PetscErrorCode     ierr;
5207   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5208   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5209 
5210   PetscFunctionBegin;
5211   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5212   b    = (Mat_SeqBAIJ*)(fact)->data;
5213 
5214   /* allocate matrix arrays for new data structure */
5215   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5216   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5217   b->singlemalloc = PETSC_TRUE;
5218   if (!b->diag){
5219     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5220     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5221   }
5222   bdiag = b->diag;
5223 
5224   if (n > 0) {
5225     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5226   }
5227 
5228   /* set bi and bj with new data structure */
5229   bi = b->i;
5230   bj = b->j;
5231 
5232   /* L part */
5233   bi[0] = 0;
5234   for (i=0; i<n; i++){
5235     nz = adiag[i] - ai[i];
5236     bi[i+1] = bi[i] + nz;
5237     aj = a->j + ai[i];
5238     for (j=0; j<nz; j++){
5239       *bj = aj[j]; bj++;
5240     }
5241   }
5242 
5243   /* U part */
5244   bi_temp = bi[n];
5245   bdiag[n] = bi[n]-1;
5246   for (i=n-1; i>=0; i--){
5247     nz = ai[i+1] - adiag[i] - 1;
5248     bi_temp = bi_temp + nz + 1;
5249     aj = a->j + adiag[i] + 1;
5250     for (j=0; j<nz; j++){
5251       *bj = aj[j]; bj++;
5252     }
5253     /* diag[i] */
5254     *bj = i; bj++;
5255     bdiag[i] = bi_temp - 1;
5256   }
5257   PetscFunctionReturn(0);
5258 }
5259 
5260 #undef __FUNCT__
5261 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
5262 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5263 {
5264   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5265   IS                 isicol;
5266   PetscErrorCode     ierr;
5267   const PetscInt     *r,*ic;
5268   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5269   PetscInt           *bi,*cols,nnz,*cols_lvl;
5270   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5271   PetscInt           i,levels,diagonal_fill;
5272   PetscTruth         col_identity,row_identity,both_identity;
5273   PetscReal          f;
5274   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5275   PetscBT            lnkbt;
5276   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5277   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5278   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5279   PetscTruth         missing;
5280   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5281 
5282   PetscFunctionBegin;
5283   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5284   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5285   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5286 
5287   f             = info->fill;
5288   levels        = (PetscInt)info->levels;
5289   diagonal_fill = (PetscInt)info->diagonal_fill;
5290   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5291 
5292   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5293   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5294   both_identity = (PetscTruth) (row_identity && col_identity);
5295 
5296   if (!levels && both_identity) {
5297     /* special case: ilu(0) with natural ordering */
5298     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5299     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
5300 
5301     fact->factor = MAT_FACTOR_ILU;
5302     (fact)->info.factor_mallocs    = 0;
5303     (fact)->info.fill_ratio_given  = info->fill;
5304     (fact)->info.fill_ratio_needed = 1.0;
5305     b                = (Mat_SeqBAIJ*)(fact)->data;
5306     b->row           = isrow;
5307     b->col           = iscol;
5308     b->icol          = isicol;
5309     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5310     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5311     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5312     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5313     PetscFunctionReturn(0);
5314   }
5315 
5316   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5317   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5318 
5319   /* get new row pointers */
5320   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5321   bi[0] = 0;
5322   /* bdiag is location of diagonal in factor */
5323   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5324   bdiag[0]  = 0;
5325 
5326   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5327 
5328   /* create a linked list for storing column indices of the active row */
5329   nlnk = n + 1;
5330   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5331 
5332   /* initial FreeSpace size is f*(ai[n]+1) */
5333   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5334   current_space = free_space;
5335   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5336   current_space_lvl = free_space_lvl;
5337 
5338   for (i=0; i<n; i++) {
5339     nzi = 0;
5340     /* copy current row into linked list */
5341     nnz  = ai[r[i]+1] - ai[r[i]];
5342     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5343     cols = aj + ai[r[i]];
5344     lnk[i] = -1; /* marker to indicate if diagonal exists */
5345     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5346     nzi += nlnk;
5347 
5348     /* make sure diagonal entry is included */
5349     if (diagonal_fill && lnk[i] == -1) {
5350       fm = n;
5351       while (lnk[fm] < i) fm = lnk[fm];
5352       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5353       lnk[fm]    = i;
5354       lnk_lvl[i] = 0;
5355       nzi++; dcount++;
5356     }
5357 
5358     /* add pivot rows into the active row */
5359     nzbd = 0;
5360     prow = lnk[n];
5361     while (prow < i) {
5362       nnz      = bdiag[prow];
5363       cols     = bj_ptr[prow] + nnz + 1;
5364       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5365       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5366       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5367       nzi += nlnk;
5368       prow = lnk[prow];
5369       nzbd++;
5370     }
5371     bdiag[i] = nzbd;
5372     bi[i+1]  = bi[i] + nzi;
5373 
5374     /* if free space is not available, make more free space */
5375     if (current_space->local_remaining<nzi) {
5376       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5377       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5378       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5379       reallocs++;
5380     }
5381 
5382     /* copy data into free_space and free_space_lvl, then initialize lnk */
5383     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5384     bj_ptr[i]    = current_space->array;
5385     bjlvl_ptr[i] = current_space_lvl->array;
5386 
5387     /* make sure the active row i has diagonal entry */
5388     if (*(bj_ptr[i]+bdiag[i]) != i) {
5389       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5390     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5391     }
5392 
5393     current_space->array           += nzi;
5394     current_space->local_used      += nzi;
5395     current_space->local_remaining -= nzi;
5396     current_space_lvl->array           += nzi;
5397     current_space_lvl->local_used      += nzi;
5398     current_space_lvl->local_remaining -= nzi;
5399   }
5400 
5401   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5402   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5403 
5404   /* destroy list of free space and other temporary arrays */
5405   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5406 
5407   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5408   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5409 
5410   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5411   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5412   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5413 
5414 #if defined(PETSC_USE_INFO)
5415   {
5416     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5417     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5418     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5419     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5420     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5421     if (diagonal_fill) {
5422       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5423     }
5424   }
5425 #endif
5426 
5427   /* put together the new matrix */
5428   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5429   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5430   b = (Mat_SeqBAIJ*)(fact)->data;
5431   b->free_a       = PETSC_TRUE;
5432   b->free_ij      = PETSC_TRUE;
5433   b->singlemalloc = PETSC_FALSE;
5434   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5435   b->j          = bj;
5436   b->i          = bi;
5437   b->diag       = bdiag;
5438   b->free_diag  = PETSC_TRUE;
5439   b->ilen       = 0;
5440   b->imax       = 0;
5441   b->row        = isrow;
5442   b->col        = iscol;
5443   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5444   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5445   b->icol       = isicol;
5446   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5447   /* In b structure:  Free imax, ilen, old a, old j.
5448      Allocate bdiag, solve_work, new a, new j */
5449   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5450   b->maxnz = b->nz = bdiag[0]+1;
5451   fact->info.factor_mallocs    = reallocs;
5452   fact->info.fill_ratio_given  = f;
5453   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5454   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
5455   PetscFunctionReturn(0);
5456 }
5457 
5458 
5459 /*
5460      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5461    except that the data structure of Mat_SeqAIJ is slightly different.
5462    Not a good example of code reuse.
5463 */
5464 #undef __FUNCT__
5465 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5466 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5467 {
5468   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5469   IS             isicol;
5470   PetscErrorCode ierr;
5471   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5472   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5473   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5474   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5475   PetscTruth     col_identity,row_identity,both_identity,flg;
5476   PetscReal      f;
5477   PetscTruth     newdatastruct = PETSC_FALSE;
5478 
5479   PetscFunctionBegin;
5480   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
5481   if (newdatastruct){
5482     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5483     PetscFunctionReturn(0);
5484   }
5485 
5486   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5487   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5488 
5489   f             = info->fill;
5490   levels        = (PetscInt)info->levels;
5491   diagonal_fill = (PetscInt)info->diagonal_fill;
5492   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5493 
5494   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5495   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5496   both_identity = (PetscTruth) (row_identity && col_identity);
5497 
5498   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5499     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5500     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5501 
5502     fact->factor = MAT_FACTOR_ILU;
5503     b            = (Mat_SeqBAIJ*)fact->data;
5504     b->row       = isrow;
5505     b->col       = iscol;
5506     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5507     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5508     b->icol      = isicol;
5509     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5510     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5511     PetscFunctionReturn(0);
5512   }
5513 
5514   /* general case perform the symbolic factorization */
5515     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5516     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5517 
5518     /* get new row pointers */
5519     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5520     ainew[0] = 0;
5521     /* don't know how many column pointers are needed so estimate */
5522     jmax = (PetscInt)(f*ai[n] + 1);
5523     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5524     /* ajfill is level of fill for each fill entry */
5525     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5526     /* fill is a linked list of nonzeros in active row */
5527     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5528     /* im is level for each filled value */
5529     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5530     /* dloc is location of diagonal in factor */
5531     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5532     dloc[0]  = 0;
5533     for (prow=0; prow<n; prow++) {
5534 
5535       /* copy prow into linked list */
5536       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5537       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5538       xi         = aj + ai[r[prow]];
5539       fill[n]    = n;
5540       fill[prow] = -1; /* marker for diagonal entry */
5541       while (nz--) {
5542 	fm  = n;
5543 	idx = ic[*xi++];
5544 	do {
5545 	  m  = fm;
5546 	  fm = fill[m];
5547 	} while (fm < idx);
5548 	fill[m]   = idx;
5549 	fill[idx] = fm;
5550 	im[idx]   = 0;
5551       }
5552 
5553       /* make sure diagonal entry is included */
5554       if (diagonal_fill && fill[prow] == -1) {
5555 	fm = n;
5556 	while (fill[fm] < prow) fm = fill[fm];
5557 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5558 	fill[fm]   = prow;
5559 	im[prow]   = 0;
5560 	nzf++;
5561 	dcount++;
5562       }
5563 
5564       nzi = 0;
5565       row = fill[n];
5566       while (row < prow) {
5567 	incrlev = im[row] + 1;
5568 	nz      = dloc[row];
5569 	xi      = ajnew  + ainew[row] + nz + 1;
5570 	flev    = ajfill + ainew[row] + nz + 1;
5571 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5572 	fm      = row;
5573 	while (nnz-- > 0) {
5574 	  idx = *xi++;
5575 	  if (*flev + incrlev > levels) {
5576 	    flev++;
5577 	    continue;
5578 	  }
5579 	  do {
5580 	    m  = fm;
5581 	    fm = fill[m];
5582 	  } while (fm < idx);
5583 	  if (fm != idx) {
5584 	    im[idx]   = *flev + incrlev;
5585 	    fill[m]   = idx;
5586 	    fill[idx] = fm;
5587 	    fm        = idx;
5588 	    nzf++;
5589 	  } else {
5590 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5591 	  }
5592 	  flev++;
5593 	}
5594 	row = fill[row];
5595 	nzi++;
5596       }
5597       /* copy new filled row into permanent storage */
5598       ainew[prow+1] = ainew[prow] + nzf;
5599       if (ainew[prow+1] > jmax) {
5600 
5601 	/* estimate how much additional space we will need */
5602 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5603 	/* just double the memory each time */
5604 	PetscInt maxadd = jmax;
5605 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5606 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5607 	jmax += maxadd;
5608 
5609 	/* allocate a longer ajnew and ajfill */
5610 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5611 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5612 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5613 	ajnew = xitmp;
5614 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5615 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5616 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5617 	ajfill = xitmp;
5618 	reallocate++; /* count how many reallocations are needed */
5619       }
5620       xitmp       = ajnew + ainew[prow];
5621       flev        = ajfill + ainew[prow];
5622       dloc[prow]  = nzi;
5623       fm          = fill[n];
5624       while (nzf--) {
5625 	*xitmp++ = fm;
5626 	*flev++ = im[fm];
5627 	fm      = fill[fm];
5628       }
5629       /* make sure row has diagonal entry */
5630       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5631 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5632     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5633       }
5634     }
5635     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5636     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5637     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5638     ierr = PetscFree(fill);CHKERRQ(ierr);
5639     ierr = PetscFree(im);CHKERRQ(ierr);
5640 
5641 #if defined(PETSC_USE_INFO)
5642     {
5643       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5644       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5645       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5646       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5647       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5648       if (diagonal_fill) {
5649 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5650       }
5651     }
5652 #endif
5653 
5654     /* put together the new matrix */
5655     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5656     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5657     b    = (Mat_SeqBAIJ*)fact->data;
5658     b->free_a       = PETSC_TRUE;
5659     b->free_ij      = PETSC_TRUE;
5660     b->singlemalloc = PETSC_FALSE;
5661     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5662     b->j          = ajnew;
5663     b->i          = ainew;
5664     for (i=0; i<n; i++) dloc[i] += ainew[i];
5665     b->diag       = dloc;
5666     b->free_diag  = PETSC_TRUE;
5667     b->ilen       = 0;
5668     b->imax       = 0;
5669     b->row        = isrow;
5670     b->col        = iscol;
5671     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5672     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5673     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5674     b->icol       = isicol;
5675     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5676     /* In b structure:  Free imax, ilen, old a, old j.
5677        Allocate dloc, solve_work, new a, new j */
5678     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5679     b->maxnz          = b->nz = ainew[n];
5680 
5681     fact->info.factor_mallocs    = reallocate;
5682     fact->info.fill_ratio_given  = f;
5683     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5684 
5685   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5686   PetscFunctionReturn(0);
5687 }
5688 
5689 #undef __FUNCT__
5690 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5691 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5692 {
5693   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5694   /* int i,*AJ=a->j,nz=a->nz; */
5695   PetscFunctionBegin;
5696   /* Undo Column scaling */
5697 /*    while (nz--) { */
5698 /*      AJ[i] = AJ[i]/4; */
5699 /*    } */
5700   /* This should really invoke a push/pop logic, but we don't have that yet. */
5701   A->ops->setunfactored = PETSC_NULL;
5702   PetscFunctionReturn(0);
5703 }
5704 
5705 #undef __FUNCT__
5706 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5707 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5708 {
5709   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5710   PetscInt       *AJ=a->j,nz=a->nz;
5711   unsigned short *aj=(unsigned short *)AJ;
5712   PetscFunctionBegin;
5713   /* Is this really necessary? */
5714   while (nz--) {
5715     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5716   }
5717   A->ops->setunfactored = PETSC_NULL;
5718   PetscFunctionReturn(0);
5719 }
5720 
5721 
5722