xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision eeffb40d691afbdd57a8091619e7ddd44ac5fdca)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124   PetscInt       *diag = a->diag,oidx;
125   MatScalar      *aa=a->a,*v;
126   PetscScalar    s1,s2,s3,x1,x2,x3;
127   PetscScalar    *x,*b;
128 
129   PetscFunctionBegin;
130   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133 
134   /* forward solve the U^T */
135   idx = 0;
136   for (i=0; i<n; i++) {
137 
138     v     = aa + 9*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144     v += 9;
145 
146     vi    = aj + diag[i] + 1;
147     nz    = ai[i+1] - diag[i] - 1;
148     while (nz--) {
149       oidx = 3*(*vi++);
150       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153       v  += 9;
154     }
155     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156     idx += 3;
157   }
158   /* backward solve the L^T */
159   for (i=n-1; i>=0; i--){
160     v    = aa + 9*diag[i] - 9;
161     vi   = aj + diag[i] - 1;
162     nz   = diag[i] - ai[i];
163     idt  = 3*i;
164     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165     while (nz--) {
166       idx   = 3*(*vi--);
167       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170       v -= 9;
171     }
172   }
173   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182 {
183   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184   PetscErrorCode ierr;
185   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186   PetscInt       *diag = a->diag,oidx;
187   MatScalar      *aa=a->a,*v;
188   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
189   PetscScalar    *x,*b;
190 
191   PetscFunctionBegin;
192   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
193   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195 
196   /* forward solve the U^T */
197   idx = 0;
198   for (i=0; i<n; i++) {
199 
200     v     = aa + 16*diag[i];
201     /* multiply by the inverse of the block diagonal */
202     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207     v += 16;
208 
209     vi    = aj + diag[i] + 1;
210     nz    = ai[i+1] - diag[i] - 1;
211     while (nz--) {
212       oidx = 4*(*vi++);
213       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217       v  += 16;
218     }
219     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220     idx += 4;
221   }
222   /* backward solve the L^T */
223   for (i=n-1; i>=0; i--){
224     v    = aa + 16*diag[i] - 16;
225     vi   = aj + diag[i] - 1;
226     nz   = diag[i] - ai[i];
227     idt  = 4*i;
228     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229     while (nz--) {
230       idx   = 4*(*vi--);
231       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235       v -= 16;
236     }
237   }
238   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
239   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247 {
248   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249   PetscErrorCode ierr;
250   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251   PetscInt       *diag = a->diag,oidx;
252   MatScalar      *aa=a->a,*v;
253   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
254   PetscScalar    *x,*b;
255 
256   PetscFunctionBegin;
257   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
258   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260 
261   /* forward solve the U^T */
262   idx = 0;
263   for (i=0; i<n; i++) {
264 
265     v     = aa + 25*diag[i];
266     /* multiply by the inverse of the block diagonal */
267     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273     v += 25;
274 
275     vi    = aj + diag[i] + 1;
276     nz    = ai[i+1] - diag[i] - 1;
277     while (nz--) {
278       oidx = 5*(*vi++);
279       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284       v  += 25;
285     }
286     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287     idx += 5;
288   }
289   /* backward solve the L^T */
290   for (i=n-1; i>=0; i--){
291     v    = aa + 25*diag[i] - 25;
292     vi   = aj + diag[i] - 1;
293     nz   = diag[i] - ai[i];
294     idt  = 5*i;
295     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296     while (nz--) {
297       idx   = 5*(*vi--);
298       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303       v -= 25;
304     }
305   }
306   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
307   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315 {
316   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317   PetscErrorCode ierr;
318   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319   PetscInt       *diag = a->diag,oidx;
320   MatScalar      *aa=a->a,*v;
321   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
322   PetscScalar    *x,*b;
323 
324   PetscFunctionBegin;
325   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
326   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
327   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328 
329   /* forward solve the U^T */
330   idx = 0;
331   for (i=0; i<n; i++) {
332 
333     v     = aa + 36*diag[i];
334     /* multiply by the inverse of the block diagonal */
335     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336     x6    = x[5+idx];
337     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343     v += 36;
344 
345     vi    = aj + diag[i] + 1;
346     nz    = ai[i+1] - diag[i] - 1;
347     while (nz--) {
348       oidx = 6*(*vi++);
349       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355       v  += 36;
356     }
357     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358     x[5+idx] = s6;
359     idx += 6;
360   }
361   /* backward solve the L^T */
362   for (i=n-1; i>=0; i--){
363     v    = aa + 36*diag[i] - 36;
364     vi   = aj + diag[i] - 1;
365     nz   = diag[i] - ai[i];
366     idt  = 6*i;
367     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368     s6 = x[5+idt];
369     while (nz--) {
370       idx   = 6*(*vi--);
371       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377       v -= 36;
378     }
379   }
380   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389 {
390   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391   PetscErrorCode ierr;
392   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393   PetscInt       *diag = a->diag,oidx;
394   MatScalar      *aa=a->a,*v;
395   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
396   PetscScalar    *x,*b;
397 
398   PetscFunctionBegin;
399   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
400   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
401   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402 
403   /* forward solve the U^T */
404   idx = 0;
405   for (i=0; i<n; i++) {
406 
407     v     = aa + 49*diag[i];
408     /* multiply by the inverse of the block diagonal */
409     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410     x6    = x[5+idx]; x7 = x[6+idx];
411     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418     v += 49;
419 
420     vi    = aj + diag[i] + 1;
421     nz    = ai[i+1] - diag[i] - 1;
422     while (nz--) {
423       oidx = 7*(*vi++);
424       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431       v  += 49;
432     }
433     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434     x[5+idx] = s6;x[6+idx] = s7;
435     idx += 7;
436   }
437   /* backward solve the L^T */
438   for (i=n-1; i>=0; i--){
439     v    = aa + 49*diag[i] - 49;
440     vi   = aj + diag[i] - 1;
441     nz   = diag[i] - ai[i];
442     idt  = 7*i;
443     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444     s6 = x[5+idt];s7 = x[6+idt];
445     while (nz--) {
446       idx   = 7*(*vi--);
447       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454       v -= 49;
455     }
456   }
457   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 /*---------------------------------------------------------------------------------------------*/
464 #undef __FUNCT__
465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469   IS             iscol=a->col,isrow=a->row;
470   PetscErrorCode ierr;
471   const PetscInt *r,*c,*rout,*cout;
472   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473   PetscInt       *diag = a->diag;
474   MatScalar      *aa=a->a,*v;
475   PetscScalar    s1,*x,*b,*t;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   t  = a->solve_work;
481 
482   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484 
485   /* copy the b into temp work space according to permutation */
486   for (i=0; i<n; i++) {
487     t[i] = b[c[i]];
488   }
489 
490   /* forward solve the U^T */
491   for (i=0; i<n; i++) {
492 
493     v     = aa + diag[i];
494     /* multiply by the inverse of the block diagonal */
495     s1    = (*v++)*t[i];
496     vi    = aj + diag[i] + 1;
497     nz    = ai[i+1] - diag[i] - 1;
498     while (nz--) {
499       t[*vi++]  -= (*v++)*s1;
500     }
501     t[i]   = s1;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--){
505     v    = aa + diag[i] - 1;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     s1   = t[i];
509     while (nz--) {
510       t[*vi--]   -=  (*v--)*s1;
511     }
512   }
513 
514   /* copy t into x according to permutation */
515   for (i=0; i<n; i++) {
516     x[r[i]]   = t[i];
517   }
518 
519   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
521   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524   PetscFunctionReturn(0);
525 }
526 
527 #undef __FUNCT__
528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530 {
531   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532   IS             iscol=a->col,isrow=a->row;
533   PetscErrorCode ierr;
534   const PetscInt *r,*c,*rout,*cout;
535   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537   MatScalar      *aa=a->a,*v;
538   PetscScalar    s1,s2,x1,x2;
539   PetscScalar    *x,*b,*t;
540 
541   PetscFunctionBegin;
542   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544   t  = a->solve_work;
545 
546   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548 
549   /* copy the b into temp work space according to permutation */
550   ii = 0;
551   for (i=0; i<n; i++) {
552     ic      = 2*c[i];
553     t[ii]   = b[ic];
554     t[ii+1] = b[ic+1];
555     ii += 2;
556   }
557 
558   /* forward solve the U^T */
559   idx = 0;
560   for (i=0; i<n; i++) {
561 
562     v     = aa + 4*diag[i];
563     /* multiply by the inverse of the block diagonal */
564     x1    = t[idx];   x2 = t[1+idx];
565     s1 = v[0]*x1  +  v[1]*x2;
566     s2 = v[2]*x1  +  v[3]*x2;
567     v += 4;
568 
569     vi    = aj + diag[i] + 1;
570     nz    = ai[i+1] - diag[i] - 1;
571     while (nz--) {
572       oidx = 2*(*vi++);
573       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575       v  += 4;
576     }
577     t[idx]   = s1;t[1+idx] = s2;
578     idx += 2;
579   }
580   /* backward solve the L^T */
581   for (i=n-1; i>=0; i--){
582     v    = aa + 4*diag[i] - 4;
583     vi   = aj + diag[i] - 1;
584     nz   = diag[i] - ai[i];
585     idt  = 2*i;
586     s1 = t[idt];  s2 = t[1+idt];
587     while (nz--) {
588       idx   = 2*(*vi--);
589       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591       v -= 4;
592     }
593   }
594 
595   /* copy t into x according to permutation */
596   ii = 0;
597   for (i=0; i<n; i++) {
598     ir      = 2*r[i];
599     x[ir]   = t[ii];
600     x[ir+1] = t[ii+1];
601     ii += 2;
602   }
603 
604   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
606   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615 {
616   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617   IS             iscol=a->col,isrow=a->row;
618   PetscErrorCode ierr;
619   const PetscInt *r,*c,*rout,*cout;
620   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622   MatScalar      *aa=a->a,*v;
623   PetscScalar    s1,s2,s3,x1,x2,x3;
624   PetscScalar    *x,*b,*t;
625 
626   PetscFunctionBegin;
627   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629   t  = a->solve_work;
630 
631   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633 
634   /* copy the b into temp work space according to permutation */
635   ii = 0;
636   for (i=0; i<n; i++) {
637     ic      = 3*c[i];
638     t[ii]   = b[ic];
639     t[ii+1] = b[ic+1];
640     t[ii+2] = b[ic+2];
641     ii += 3;
642   }
643 
644   /* forward solve the U^T */
645   idx = 0;
646   for (i=0; i<n; i++) {
647 
648     v     = aa + 9*diag[i];
649     /* multiply by the inverse of the block diagonal */
650     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654     v += 9;
655 
656     vi    = aj + diag[i] + 1;
657     nz    = ai[i+1] - diag[i] - 1;
658     while (nz--) {
659       oidx = 3*(*vi++);
660       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663       v  += 9;
664     }
665     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666     idx += 3;
667   }
668   /* backward solve the L^T */
669   for (i=n-1; i>=0; i--){
670     v    = aa + 9*diag[i] - 9;
671     vi   = aj + diag[i] - 1;
672     nz   = diag[i] - ai[i];
673     idt  = 3*i;
674     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675     while (nz--) {
676       idx   = 3*(*vi--);
677       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680       v -= 9;
681     }
682   }
683 
684   /* copy t into x according to permutation */
685   ii = 0;
686   for (i=0; i<n; i++) {
687     ir      = 3*r[i];
688     x[ir]   = t[ii];
689     x[ir+1] = t[ii+1];
690     x[ir+2] = t[ii+2];
691     ii += 3;
692   }
693 
694   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
696   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 #undef __FUNCT__
703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705 {
706   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707   IS             iscol=a->col,isrow=a->row;
708   PetscErrorCode ierr;
709   const PetscInt *r,*c,*rout,*cout;
710   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712   MatScalar      *aa=a->a,*v;
713   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
714   PetscScalar    *x,*b,*t;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   t  = a->solve_work;
720 
721   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723 
724   /* copy the b into temp work space according to permutation */
725   ii = 0;
726   for (i=0; i<n; i++) {
727     ic      = 4*c[i];
728     t[ii]   = b[ic];
729     t[ii+1] = b[ic+1];
730     t[ii+2] = b[ic+2];
731     t[ii+3] = b[ic+3];
732     ii += 4;
733   }
734 
735   /* forward solve the U^T */
736   idx = 0;
737   for (i=0; i<n; i++) {
738 
739     v     = aa + 16*diag[i];
740     /* multiply by the inverse of the block diagonal */
741     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746     v += 16;
747 
748     vi    = aj + diag[i] + 1;
749     nz    = ai[i+1] - diag[i] - 1;
750     while (nz--) {
751       oidx = 4*(*vi++);
752       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756       v  += 16;
757     }
758     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759     idx += 4;
760   }
761   /* backward solve the L^T */
762   for (i=n-1; i>=0; i--){
763     v    = aa + 16*diag[i] - 16;
764     vi   = aj + diag[i] - 1;
765     nz   = diag[i] - ai[i];
766     idt  = 4*i;
767     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768     while (nz--) {
769       idx   = 4*(*vi--);
770       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774       v -= 16;
775     }
776   }
777 
778   /* copy t into x according to permutation */
779   ii = 0;
780   for (i=0; i<n; i++) {
781     ir      = 4*r[i];
782     x[ir]   = t[ii];
783     x[ir+1] = t[ii+1];
784     x[ir+2] = t[ii+2];
785     x[ir+3] = t[ii+3];
786     ii += 4;
787   }
788 
789   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
791   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 #undef __FUNCT__
798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800 {
801   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802   IS             iscol=a->col,isrow=a->row;
803   PetscErrorCode ierr;
804   const PetscInt *r,*c,*rout,*cout;
805   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807   MatScalar      *aa=a->a,*v;
808   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
809   PetscScalar    *x,*b,*t;
810 
811   PetscFunctionBegin;
812   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   t  = a->solve_work;
815 
816   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818 
819   /* copy the b into temp work space according to permutation */
820   ii = 0;
821   for (i=0; i<n; i++) {
822     ic      = 5*c[i];
823     t[ii]   = b[ic];
824     t[ii+1] = b[ic+1];
825     t[ii+2] = b[ic+2];
826     t[ii+3] = b[ic+3];
827     t[ii+4] = b[ic+4];
828     ii += 5;
829   }
830 
831   /* forward solve the U^T */
832   idx = 0;
833   for (i=0; i<n; i++) {
834 
835     v     = aa + 25*diag[i];
836     /* multiply by the inverse of the block diagonal */
837     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843     v += 25;
844 
845     vi    = aj + diag[i] + 1;
846     nz    = ai[i+1] - diag[i] - 1;
847     while (nz--) {
848       oidx = 5*(*vi++);
849       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v  += 25;
855     }
856     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857     idx += 5;
858   }
859   /* backward solve the L^T */
860   for (i=n-1; i>=0; i--){
861     v    = aa + 25*diag[i] - 25;
862     vi   = aj + diag[i] - 1;
863     nz   = diag[i] - ai[i];
864     idt  = 5*i;
865     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866     while (nz--) {
867       idx   = 5*(*vi--);
868       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873       v -= 25;
874     }
875   }
876 
877   /* copy t into x according to permutation */
878   ii = 0;
879   for (i=0; i<n; i++) {
880     ir      = 5*r[i];
881     x[ir]   = t[ii];
882     x[ir+1] = t[ii+1];
883     x[ir+2] = t[ii+2];
884     x[ir+3] = t[ii+3];
885     x[ir+4] = t[ii+4];
886     ii += 5;
887   }
888 
889   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
891   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902   IS             iscol=a->col,isrow=a->row;
903   PetscErrorCode ierr;
904   const PetscInt *r,*c,*rout,*cout;
905   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907   MatScalar      *aa=a->a,*v;
908   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
909   PetscScalar    *x,*b,*t;
910 
911   PetscFunctionBegin;
912   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914   t  = a->solve_work;
915 
916   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918 
919   /* copy the b into temp work space according to permutation */
920   ii = 0;
921   for (i=0; i<n; i++) {
922     ic      = 6*c[i];
923     t[ii]   = b[ic];
924     t[ii+1] = b[ic+1];
925     t[ii+2] = b[ic+2];
926     t[ii+3] = b[ic+3];
927     t[ii+4] = b[ic+4];
928     t[ii+5] = b[ic+5];
929     ii += 6;
930   }
931 
932   /* forward solve the U^T */
933   idx = 0;
934   for (i=0; i<n; i++) {
935 
936     v     = aa + 36*diag[i];
937     /* multiply by the inverse of the block diagonal */
938     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939     x6    = t[5+idx];
940     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946     v += 36;
947 
948     vi    = aj + diag[i] + 1;
949     nz    = ai[i+1] - diag[i] - 1;
950     while (nz--) {
951       oidx = 6*(*vi++);
952       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958       v  += 36;
959     }
960     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961     t[5+idx] = s6;
962     idx += 6;
963   }
964   /* backward solve the L^T */
965   for (i=n-1; i>=0; i--){
966     v    = aa + 36*diag[i] - 36;
967     vi   = aj + diag[i] - 1;
968     nz   = diag[i] - ai[i];
969     idt  = 6*i;
970     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971     s6 = t[5+idt];
972     while (nz--) {
973       idx   = 6*(*vi--);
974       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980       v -= 36;
981     }
982   }
983 
984   /* copy t into x according to permutation */
985   ii = 0;
986   for (i=0; i<n; i++) {
987     ir      = 6*r[i];
988     x[ir]   = t[ii];
989     x[ir+1] = t[ii+1];
990     x[ir+2] = t[ii+2];
991     x[ir+3] = t[ii+3];
992     x[ir+4] = t[ii+4];
993     x[ir+5] = t[ii+5];
994     ii += 6;
995   }
996 
997   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
999   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008 {
1009   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010   IS             iscol=a->col,isrow=a->row;
1011   PetscErrorCode ierr;
1012   const PetscInt *r,*c,*rout,*cout;
1013   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015   MatScalar      *aa=a->a,*v;
1016   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1017   PetscScalar    *x,*b,*t;
1018 
1019   PetscFunctionBegin;
1020   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022   t  = a->solve_work;
1023 
1024   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026 
1027   /* copy the b into temp work space according to permutation */
1028   ii = 0;
1029   for (i=0; i<n; i++) {
1030     ic      = 7*c[i];
1031     t[ii]   = b[ic];
1032     t[ii+1] = b[ic+1];
1033     t[ii+2] = b[ic+2];
1034     t[ii+3] = b[ic+3];
1035     t[ii+4] = b[ic+4];
1036     t[ii+5] = b[ic+5];
1037     t[ii+6] = b[ic+6];
1038     ii += 7;
1039   }
1040 
1041   /* forward solve the U^T */
1042   idx = 0;
1043   for (i=0; i<n; i++) {
1044 
1045     v     = aa + 49*diag[i];
1046     /* multiply by the inverse of the block diagonal */
1047     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048     x6    = t[5+idx]; x7 = t[6+idx];
1049     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056     v += 49;
1057 
1058     vi    = aj + diag[i] + 1;
1059     nz    = ai[i+1] - diag[i] - 1;
1060     while (nz--) {
1061       oidx = 7*(*vi++);
1062       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v  += 49;
1070     }
1071     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072     t[5+idx] = s6;t[6+idx] = s7;
1073     idx += 7;
1074   }
1075   /* backward solve the L^T */
1076   for (i=n-1; i>=0; i--){
1077     v    = aa + 49*diag[i] - 49;
1078     vi   = aj + diag[i] - 1;
1079     nz   = diag[i] - ai[i];
1080     idt  = 7*i;
1081     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082     s6 = t[5+idt];s7 = t[6+idt];
1083     while (nz--) {
1084       idx   = 7*(*vi--);
1085       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092       v -= 49;
1093     }
1094   }
1095 
1096   /* copy t into x according to permutation */
1097   ii = 0;
1098   for (i=0; i<n; i++) {
1099     ir      = 7*r[i];
1100     x[ir]   = t[ii];
1101     x[ir+1] = t[ii+1];
1102     x[ir+2] = t[ii+2];
1103     x[ir+3] = t[ii+3];
1104     x[ir+4] = t[ii+4];
1105     x[ir+5] = t[ii+5];
1106     x[ir+6] = t[ii+6];
1107     ii += 7;
1108   }
1109 
1110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1112   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 /* ----------------------------------------------------------- */
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1124   IS             iscol=a->col,isrow=a->row;
1125   PetscErrorCode ierr;
1126   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1127   PetscInt       i,n=a->mbs;
1128   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1129   MatScalar      *aa=a->a,*v;
1130   PetscScalar    *x,*b,*s,*t,*ls;
1131 
1132   PetscFunctionBegin;
1133   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135   t  = a->solve_work;
1136 
1137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1139 
1140   /* forward solve the lower triangular */
1141   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1142   for (i=1; i<n; i++) {
1143     v   = aa + bs2*ai[i];
1144     vi  = aj + ai[i];
1145     nz  = a->diag[i] - ai[i];
1146     s = t + bs*i;
1147     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     while (nz--) {
1149       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1150       v += bs2;
1151     }
1152   }
1153   /* backward solve the upper triangular */
1154   ls = a->solve_work + A->cmap->n;
1155   for (i=n-1; i>=0; i--){
1156     v   = aa + bs2*(a->diag[i] + 1);
1157     vi  = aj + a->diag[i] + 1;
1158     nz  = ai[i+1] - a->diag[i] - 1;
1159     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1160     while (nz--) {
1161       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1162       v += bs2;
1163     }
1164     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1165     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1166   }
1167 
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNCT__
1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1179 {
1180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1181   IS             iscol=a->col,isrow=a->row;
1182   PetscErrorCode ierr;
1183   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1184   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1185   MatScalar      *aa=a->a,*v;
1186   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1187   PetscScalar    *x,*b,*t;
1188 
1189   PetscFunctionBegin;
1190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192   t  = a->solve_work;
1193 
1194   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1195   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1196 
1197   /* forward solve the lower triangular */
1198   idx    = 7*(*r++);
1199   t[0] = b[idx];   t[1] = b[1+idx];
1200   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201   t[5] = b[5+idx]; t[6] = b[6+idx];
1202 
1203   for (i=1; i<n; i++) {
1204     v     = aa + 49*ai[i];
1205     vi    = aj + ai[i];
1206     nz    = diag[i] - ai[i];
1207     idx   = 7*(*r++);
1208     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1210     while (nz--) {
1211       idx   = 7*(*vi++);
1212       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213       x4    = t[3+idx];x5 = t[4+idx];
1214       x6    = t[5+idx];x7 = t[6+idx];
1215       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1222       v += 49;
1223     }
1224     idx = 7*i;
1225     t[idx]   = s1;t[1+idx] = s2;
1226     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227     t[5+idx] = s6;t[6+idx] = s7;
1228   }
1229   /* backward solve the upper triangular */
1230   for (i=n-1; i>=0; i--){
1231     v    = aa + 49*diag[i] + 49;
1232     vi   = aj + diag[i] + 1;
1233     nz   = ai[i+1] - diag[i] - 1;
1234     idt  = 7*i;
1235     s1 = t[idt];  s2 = t[1+idt];
1236     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237     s6 = t[5+idt];s7 = t[6+idt];
1238     while (nz--) {
1239       idx   = 7*(*vi++);
1240       x1    = t[idx];   x2 = t[1+idx];
1241       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242       x6    = t[5+idx]; x7 = t[6+idx];
1243       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1250       v += 49;
1251     }
1252     idc = 7*(*c--);
1253     v   = aa + 49*diag[i];
1254     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1268   }
1269 
1270   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1271   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1272   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1273   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1280 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1281 {
1282   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1283   IS             iscol=a->col,isrow=a->row;
1284   PetscErrorCode ierr;
1285   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
1286   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
1287   MatScalar      *aa=a->a,*v;
1288   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1289   PetscScalar    *x,*b,*t;
1290 
1291   PetscFunctionBegin;
1292   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1293   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1294   t  = a->solve_work;
1295 
1296   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1297   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1298 
1299   /* forward solve the lower triangular */
1300   idx    = 7*r[0];
1301   t[0] = b[idx];   t[1] = b[1+idx];
1302   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1303   t[5] = b[5+idx]; t[6] = b[6+idx];
1304 
1305   for (i=1; i<n; i++) {
1306     v     = aa + 49*ai[i];
1307     vi    = aj + ai[i];
1308     nz    = ai[i+1] - ai[i];
1309     idx   = 7*r[i];
1310     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1311     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1312     for(m=0;m<nz;m++){
1313       idx   = 7*vi[m];
1314       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1315       x4    = t[3+idx];x5 = t[4+idx];
1316       x6    = t[5+idx];x7 = t[6+idx];
1317       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1318       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1319       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1320       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1321       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1322       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1323       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1324       v += 49;
1325     }
1326     idx = 7*i;
1327     t[idx]   = s1;t[1+idx] = s2;
1328     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1329     t[5+idx] = s6;t[6+idx] = s7;
1330   }
1331   /* backward solve the upper triangular */
1332   for (i=n-1; i>=0; i--){
1333     k    = 2*n-i;
1334     v    = aa + 49*ai[k];
1335     vi   = aj + ai[k];
1336     nz   = ai[k+1] - ai[k] - 1;
1337     idt  = 7*i;
1338     s1 = t[idt];  s2 = t[1+idt];
1339     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1340     s6 = t[5+idt];s7 = t[6+idt];
1341     for(m=0;m<nz;m++){
1342       idx   = 7*vi[m];
1343       x1    = t[idx];   x2 = t[1+idx];
1344       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1345       x6    = t[5+idx]; x7 = t[6+idx];
1346       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1347       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1348       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1349       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1350       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1351       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1352       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1353       v += 49;
1354     }
1355     idc = 7*c[i];
1356     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1357                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1358     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1359                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1360     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1361                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1362     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1363                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1364     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1365                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1366     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1367                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1368     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1369                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1370   }
1371 
1372   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1373   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1374   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1375   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1376   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 #undef __FUNCT__
1381 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1382 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1383 {
1384   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1385   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1386   PetscErrorCode    ierr;
1387   PetscInt          *diag = a->diag,jdx;
1388   const MatScalar   *aa=a->a,*v;
1389   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1390   const PetscScalar *b;
1391 
1392   PetscFunctionBegin;
1393   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1394   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1395   /* forward solve the lower triangular */
1396   idx    = 0;
1397   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1398   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1399   x[6] = b[6+idx];
1400   for (i=1; i<n; i++) {
1401     v     =  aa + 49*ai[i];
1402     vi    =  aj + ai[i];
1403     nz    =  diag[i] - ai[i];
1404     idx   =  7*i;
1405     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1406     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1407     s7  =  b[6+idx];
1408     while (nz--) {
1409       jdx   = 7*(*vi++);
1410       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1411       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1412       x7    = x[6+jdx];
1413       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1414       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1415       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1416       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1417       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1418       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1419       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1420       v += 49;
1421      }
1422     x[idx]   = s1;
1423     x[1+idx] = s2;
1424     x[2+idx] = s3;
1425     x[3+idx] = s4;
1426     x[4+idx] = s5;
1427     x[5+idx] = s6;
1428     x[6+idx] = s7;
1429   }
1430   /* backward solve the upper triangular */
1431   for (i=n-1; i>=0; i--){
1432     v    = aa + 49*diag[i] + 49;
1433     vi   = aj + diag[i] + 1;
1434     nz   = ai[i+1] - diag[i] - 1;
1435     idt  = 7*i;
1436     s1 = x[idt];   s2 = x[1+idt];
1437     s3 = x[2+idt]; s4 = x[3+idt];
1438     s5 = x[4+idt]; s6 = x[5+idt];
1439     s7 = x[6+idt];
1440     while (nz--) {
1441       idx   = 7*(*vi++);
1442       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1443       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1444       x7    = x[6+idx];
1445       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1446       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1447       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1448       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1449       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1450       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1451       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1452       v += 49;
1453     }
1454     v        = aa + 49*diag[i];
1455     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1456                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1457     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1458                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1459     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1460                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1461     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1462                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1463     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1464                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1465     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1466                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1467     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1468                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1469   }
1470 
1471   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1472   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1473   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1474   PetscFunctionReturn(0);
1475 }
1476 
1477 #undef __FUNCT__
1478 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1479 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1480 {
1481     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1482     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1483     PetscErrorCode    ierr;
1484     PetscInt          idx,jdx,idt;
1485     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1486     const MatScalar   *aa=a->a,*v;
1487     PetscScalar       *x;
1488     const PetscScalar *b;
1489     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1490 
1491     PetscFunctionBegin;
1492     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1493     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1494     /* forward solve the lower triangular */
1495     idx    = 0;
1496     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1497     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1498     for (i=1; i<n; i++) {
1499        v    = aa + bs2*ai[i];
1500        vi   = aj + ai[i];
1501        nz   = ai[i+1] - ai[i];
1502       idx   = bs*i;
1503        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1504        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1505        for(k=0;k<nz;k++) {
1506           jdx   = bs*vi[k];
1507           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1508 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1509           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1510           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1511           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1512 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1513           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1514 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1515 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1516           v   +=  bs2;
1517         }
1518 
1519        x[idx]   = s1;
1520        x[1+idx] = s2;
1521        x[2+idx] = s3;
1522        x[3+idx] = s4;
1523        x[4+idx] = s5;
1524        x[5+idx] = s6;
1525        x[6+idx] = s7;
1526     }
1527 
1528    /* backward solve the upper triangular */
1529   for (i=n-1; i>=0; i--){
1530      v   = aa + bs2*ai[2*n-i];
1531      vi  = aj + ai[2*n-i];
1532      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1533      idt = bs*i;
1534      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1535      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1536     for(k=0;k<nz;k++) {
1537       idx   = bs*vi[k];
1538        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1539        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1540        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1541        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1542        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1543        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1544        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1545        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1546        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1547         v   +=  bs2;
1548     }
1549     /* x = inv_diagonal*x */
1550     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1551     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1552     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1553     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1554     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1555     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1556     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1557   }
1558 
1559   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1560   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1561   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1562   PetscFunctionReturn(0);
1563 }
1564 
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2 - forward/backward block
   substitution with hand-unrolled 7x7 blocks, carried out directly in the
   array of xx.

   A  - matrix; A->data is used as a Mat_SeqBAIJ (arrays i, j, diag, a and the
        sizes mbs, bs2 are read)
   bb - right-hand side, only read
   xx - result vector, overwritten

   No row or column index sets are consulted: block row i uses entries
   bs*i .. bs*i+6 of both bb and xx.
   A block is addressed as v[r + 7*c] for row r and column c inside the block.
   The literal offsets 0..48 are combined with the run-time bs and bs2, so the
   routine presumably requires bs == 7 and bs2 == 49 -- TODO confirm at the
   call site.
   NOTE(review): entries 0..6 are copied before any test on n, so n >= 1 looks
   like an unstated precondition -- confirm.

   Returns 0 on the normal path; the result of every PETSc call is passed
   through CHKERRQ.
*/
1565 #undef __FUNCT__
1566 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
1567 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1568 {
1569     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1570     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
1571     PetscErrorCode    ierr;
1572     PetscInt          idx,jdx,idt;
1573     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1574     const MatScalar   *aa=a->a,*v;
1575     PetscScalar       *x;
1576     const PetscScalar *b;
1577     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1578 
1579     PetscFunctionBegin;
    /* b is only read below; the cast drops const to match the VecGetArray call */
1580     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1581     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1582     /* forward solve the lower triangular */
    /* block row 0 is a plain copy of the first 7 entries of b */
1583     idx    = 0;
1584     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1585     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1586     for (i=1; i<n; i++) {
       /* blocks ai[i] .. ai[i+1]-1 belong to this row; vi[] holds their block columns */
1587        v    = aa + bs2*ai[i];
1588        vi   = aj + ai[i];
1589        nz   = ai[i+1] - ai[i];
1590       idx   = bs*i;
1591        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1592        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1593        for(k=0;k<nz;k++) {
          /* s -= (current block) * (segment of x at block column vi[k]) */
1594           jdx   = bs*vi[k];
1595           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1596 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1597           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1598           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1599           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1600 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1601           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1602 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1603 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1604           v   +=  bs2;
1605         }
1606 
       /* store the finished segment of block row i */
1607        x[idx]   = s1;
1608        x[1+idx] = s2;
1609        x[2+idx] = s3;
1610        x[3+idx] = s4;
1611        x[4+idx] = s5;
1612        x[5+idx] = s6;
1613        x[6+idx] = s7;
1614     }
1615 
1616    /* backward solve the upper triangular */
1617   for (i=n-1; i>=0; i--){
     /* blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted; adiag[i] is used after the loop */
1618     v   = aa + bs2*(adiag[i+1]+1);
1619      vi  = aj + adiag[i+1]+1;
1620      nz  = adiag[i] - adiag[i+1]-1;
1621      idt = bs*i;
1622      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1623      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1624     for(k=0;k<nz;k++) {
1625       idx   = bs*vi[k];
1626        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1627        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1628        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1629        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1630        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1631        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1632        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1633        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1634        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1635         v   +=  bs2;
1636     }
1637     /* x = inv_diagonal*x */
    /* v has advanced nz blocks and now points at block adiag[i] */
1638     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1639     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1640     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1641     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1642     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1643     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1644     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1645   }
1646 
1647   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1648   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to the PETSc log: 2*bs2*nz - bs*(A->cmap->n) */
1649   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1650   PetscFunctionReturn(0);
1651 }
1652 
/*
   MatSolve_SeqBAIJ_6 - forward/backward block substitution with hand-unrolled
   6x6 blocks, using the row and column index sets stored in the matrix.

   A  - matrix; A->data is used as a Mat_SeqBAIJ (i, j, diag, a, row, col,
        solve_work and mbs are read)
   bb - right-hand side, only read
   xx - result vector, overwritten

   Intermediate values live in the work array a->solve_work (t), one 6-entry
   segment per block row.  The segment for row i is seeded from bb at block
   position r[i] (indices of a->row); the final value of row i is written to
   xx at block position c[i] (indices of a->col).
   A block is addressed as v[r + 6*c] for row r and column c inside the block.
   NOTE(review): r[0] and the first segment of bb are read before any test on
   n, so n >= 1 looks like an unstated precondition -- confirm.

   Returns 0 on the normal path; the result of every PETSc call is passed
   through CHKERRQ.
*/
1653 #undef __FUNCT__
1654 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1655 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1656 {
1657   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1658   IS                iscol=a->col,isrow=a->row;
1659   PetscErrorCode    ierr;
1660   const PetscInt    *r,*c,*rout,*cout;
1661   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1662   const MatScalar   *aa=a->a,*v;
1663   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1664   const PetscScalar *b;
1665   PetscFunctionBegin;
  /* b is only read below; the cast drops const to match the VecGetArray call */
1666   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1667   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1668   t  = a->solve_work;
1669 
  /* r walks the row indices forward; c starts at the last column index and walks backward */
1670   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1671   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1672 
1673   /* forward solve the lower triangular */
  /* block row 0: copy the bb segment selected by the first row index into t */
1674   idx    = 6*(*r++);
1675   t[0] = b[idx];   t[1] = b[1+idx];
1676   t[2] = b[2+idx]; t[3] = b[3+idx];
1677   t[4] = b[4+idx]; t[5] = b[5+idx];
1678   for (i=1; i<n; i++) {
    /* blocks ai[i] .. diag[i]-1 are applied in this pass */
1679     v     = aa + 36*ai[i];
1680     vi    = aj + ai[i];
1681     nz    = diag[i] - ai[i];
1682     idx   = 6*(*r++);
1683     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1684     s5  = b[4+idx]; s6 = b[5+idx];
1685     while (nz--) {
      /* s -= (current block) * (segment of t at block column *vi) */
1686       idx   = 6*(*vi++);
1687       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1688       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1689       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1690       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1691       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1692       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1693       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1694       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1695       v += 36;
1696     }
    /* the work array is indexed by i itself, not by the row index */
1697     idx = 6*i;
1698     t[idx]   = s1;t[1+idx] = s2;
1699     t[2+idx] = s3;t[3+idx] = s4;
1700     t[4+idx] = s5;t[5+idx] = s6;
1701   }
1702   /* backward solve the upper triangular */
1703   for (i=n-1; i>=0; i--){
    /* blocks diag[i]+1 .. ai[i+1]-1 are subtracted first */
1704     v    = aa + 36*diag[i] + 36;
1705     vi   = aj + diag[i] + 1;
1706     nz   = ai[i+1] - diag[i] - 1;
1707     idt  = 6*i;
1708     s1 = t[idt];  s2 = t[1+idt];
1709     s3 = t[2+idt];s4 = t[3+idt];
1710     s5 = t[4+idt];s6 = t[5+idt];
1711     while (nz--) {
1712       idx   = 6*(*vi++);
1713       x1    = t[idx];   x2 = t[1+idx];
1714       x3    = t[2+idx]; x4 = t[3+idx];
1715       x5    = t[4+idx]; x6 = t[5+idx];
1716       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1717       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1718       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1719       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1720       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1721       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1722       v += 36;
1723     }
    /* multiply by block diag[i]; the product goes to t (row i) and to x (block position *c) */
1724     idc = 6*(*c--);
1725     v   = aa + 36*diag[i];
1726     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1727                                  v[18]*s4+v[24]*s5+v[30]*s6;
1728     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1729                                  v[19]*s4+v[25]*s5+v[31]*s6;
1730     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1731                                  v[20]*s4+v[26]*s5+v[32]*s6;
1732     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1733                                  v[21]*s4+v[27]*s5+v[33]*s6;
1734     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1735                                  v[22]*s4+v[28]*s5+v[34]*s6;
1736     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1737                                  v[23]*s4+v[29]*s5+v[35]*s6;
1738   }
1739 
1740   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1741   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1742   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1743   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to the PETSc log: 2*36*nz - 6*(A->cmap->n) */
1744   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1745   PetscFunctionReturn(0);
1746 }
1747 
/*
   MatSolve_SeqBAIJ_6_newdatastruct - forward/backward block substitution with
   hand-unrolled 6x6 blocks, using the row and column index sets stored in the
   matrix and locating every block range through the i array only.

   A  - matrix; A->data is used as a Mat_SeqBAIJ (i, j, a, row, col,
        solve_work and mbs are read)
   bb - right-hand side, only read
   xx - result vector, overwritten

   Forward pass, row i: blocks ai[i] .. ai[i+1]-1.
   Backward pass, row i: with k = 2*n-i, blocks ai[k] .. ai[k+1]-2 are
   subtracted and the block at ai[k+1]-1 is then multiplied into the result.
   Intermediate values live in a->solve_work (t); row i is seeded from bb at
   block position r[i] and its final value is written to xx at block position
   c[i].
   A block is addressed as v[r + 6*c] for row r and column c inside the block.
   NOTE(review): r[0] and the first segment of bb are read before any test on
   n, so n >= 1 looks like an unstated precondition -- confirm.

   Returns 0 on the normal path; the result of every PETSc call is passed
   through CHKERRQ.
*/
1748 #undef __FUNCT__
1749 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
1750 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
1751 {
1752   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1753   IS                iscol=a->col,isrow=a->row;
1754   PetscErrorCode    ierr;
1755   const PetscInt    *r,*c,*rout,*cout;
1756   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
1757   const MatScalar   *aa=a->a,*v;
1758   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1759   const PetscScalar *b;
1760   PetscFunctionBegin;
  /* b is only read below; the cast drops const to match the VecGetArray call */
1761   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763   t  = a->solve_work;
1764 
1765   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1766   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1767 
1768   /* forward solve the lower triangular */
  /* block row 0: copy the bb segment selected by r[0] into t */
1769   idx    = 6*r[0];
1770   t[0] = b[idx];   t[1] = b[1+idx];
1771   t[2] = b[2+idx]; t[3] = b[3+idx];
1772   t[4] = b[4+idx]; t[5] = b[5+idx];
1773   for (i=1; i<n; i++) {
1774     v     = aa + 36*ai[i];
1775     vi    = aj + ai[i];
1776     nz    = ai[i+1] - ai[i];
1777     idx   = 6*r[i];
1778     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1779     s5  = b[4+idx]; s6 = b[5+idx];
1780     for(m=0;m<nz;m++){
      /* s -= (current block) * (segment of t at block column vi[m]) */
1781       idx   = 6*vi[m];
1782       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1783       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1784       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1785       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1786       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1787       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1788       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1789       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1790       v += 36;
1791     }
    /* the work array is indexed by i itself, not by r[i] */
1792     idx = 6*i;
1793     t[idx]   = s1;t[1+idx] = s2;
1794     t[2+idx] = s3;t[3+idx] = s4;
1795     t[4+idx] = s5;t[5+idx] = s6;
1796   }
1797   /* backward solve the upper triangular */
1798   for (i=n-1; i>=0; i--){
    /* the range for row i is found at position 2*n-i of ai; its last block is kept for the final multiply */
1799     k    = 2*n-i;
1800     v    = aa + 36*ai[k];
1801     vi   = aj + ai[k];
1802     nz   = ai[k+1] - ai[k] - 1;
1803     idt  = 6*i;
1804     s1 = t[idt];  s2 = t[1+idt];
1805     s3 = t[2+idt];s4 = t[3+idt];
1806     s5 = t[4+idt];s6 = t[5+idt];
1807     for(m=0;m<nz;m++){
1808       idx   = 6*vi[m];
1809       x1    = t[idx];   x2 = t[1+idx];
1810       x3    = t[2+idx]; x4 = t[3+idx];
1811       x5    = t[4+idx]; x6 = t[5+idx];
1812       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1813       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1814       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1815       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1816       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1817       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1818       v += 36;
1819     }
    /* v now points at block ai[k+1]-1; the product goes to t (row i) and to x (block position c[i]) */
1820     idc = 6*c[i];
1821     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1822                                  v[18]*s4+v[24]*s5+v[30]*s6;
1823     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1824                                  v[19]*s4+v[25]*s5+v[31]*s6;
1825     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1826                                  v[20]*s4+v[26]*s5+v[32]*s6;
1827     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1828                                  v[21]*s4+v[27]*s5+v[33]*s6;
1829     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1830                                  v[22]*s4+v[28]*s5+v[34]*s6;
1831     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1832                                  v[23]*s4+v[29]*s5+v[35]*s6;
1833   }
1834 
1835   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1836   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1837   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1838   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to the PETSc log: 2*36*nz - 6*(A->cmap->n) */
1839   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1840   PetscFunctionReturn(0);
1841 }
1842 
/*
   MatSolve_SeqBAIJ_6_newdatastruct_v2 - forward/backward block substitution
   with hand-unrolled 6x6 blocks, using the row and column index sets stored
   in the matrix; the backward pass locates its blocks through the diag array.

   A  - matrix; A->data is used as a Mat_SeqBAIJ (i, j, diag, a, row, col,
        solve_work and mbs are read)
   bb - right-hand side, only read
   xx - result vector, overwritten

   Forward pass, row i: blocks ai[i] .. ai[i+1]-1.
   Backward pass, row i: blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted and
   the block at adiag[i] is then multiplied into the result.
   Intermediate values live in a->solve_work (t); row i is seeded from bb at
   block position r[i] and its final value is written to xx at block position
   c[i].
   A block is addressed as v[r + 6*c] for row r and column c inside the block.
   NOTE(review): r[0] and the first segment of bb are read before any test on
   n, so n >= 1 looks like an unstated precondition -- confirm.

   Returns 0 on the normal path; the result of every PETSc call is passed
   through CHKERRQ.
*/
1843 #undef __FUNCT__
1844 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2"
1845 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1846 {
1847   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1848   IS                iscol=a->col,isrow=a->row;
1849   PetscErrorCode    ierr;
1850   const PetscInt    *r,*c,*rout,*cout;
1851   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
1852   const MatScalar   *aa=a->a,*v;
1853   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1854   const PetscScalar *b;
1855   PetscFunctionBegin;
  /* b is only read below; the cast drops const to match the VecGetArray call */
1856   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1857   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1858   t  = a->solve_work;
1859 
1860   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1861   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1862 
1863   /* forward solve the lower triangular */
  /* block row 0: copy the bb segment selected by r[0] into t */
1864   idx    = 6*r[0];
1865   t[0] = b[idx];   t[1] = b[1+idx];
1866   t[2] = b[2+idx]; t[3] = b[3+idx];
1867   t[4] = b[4+idx]; t[5] = b[5+idx];
1868   for (i=1; i<n; i++) {
1869     v     = aa + 36*ai[i];
1870     vi    = aj + ai[i];
1871     nz    = ai[i+1] - ai[i];
1872     idx   = 6*r[i];
1873     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1874     s5  = b[4+idx]; s6 = b[5+idx];
1875     for(m=0;m<nz;m++){
      /* s -= (current block) * (segment of t at block column vi[m]) */
1876       idx   = 6*vi[m];
1877       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1878       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1879       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1880       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1881       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1882       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1883       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1884       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1885       v += 36;
1886     }
    /* the work array is indexed by i itself, not by r[i] */
1887     idx = 6*i;
1888     t[idx]   = s1;t[1+idx] = s2;
1889     t[2+idx] = s3;t[3+idx] = s4;
1890     t[4+idx] = s5;t[5+idx] = s6;
1891   }
1892   /* backward solve the upper triangular */
1893   for (i=n-1; i>=0; i--){
    /* adiag[i+1] is read, so for i = n-1 entry adiag[n] is accessed */
1894     v    = aa + 36*(adiag[i+1]+1);
1895     vi   = aj + adiag[i+1]+1;
1896     nz   = adiag[i] - adiag[i+1] - 1;
1897     idt  = 6*i;
1898     s1 = t[idt];  s2 = t[1+idt];
1899     s3 = t[2+idt];s4 = t[3+idt];
1900     s5 = t[4+idt];s6 = t[5+idt];
1901     for(m=0;m<nz;m++){
1902       idx   = 6*vi[m];
1903       x1    = t[idx];   x2 = t[1+idx];
1904       x3    = t[2+idx]; x4 = t[3+idx];
1905       x5    = t[4+idx]; x6 = t[5+idx];
1906       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1907       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1908       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1909       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1910       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1911       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1912       v += 36;
1913     }
    /* v now points at block adiag[i]; the product goes to t (row i) and to x (block position c[i]) */
1914     idc = 6*c[i];
1915     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1916                                  v[18]*s4+v[24]*s5+v[30]*s6;
1917     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1918                                  v[19]*s4+v[25]*s5+v[31]*s6;
1919     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1920                                  v[20]*s4+v[26]*s5+v[32]*s6;
1921     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1922                                  v[21]*s4+v[27]*s5+v[33]*s6;
1923     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1924                                  v[22]*s4+v[28]*s5+v[34]*s6;
1925     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1926                                  v[23]*s4+v[29]*s5+v[35]*s6;
1927   }
1928 
1929   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1930   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1931   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1932   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to the PETSc log: 2*36*nz - 6*(A->cmap->n) */
1933   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1934   PetscFunctionReturn(0);
1935 }
1936 
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering - forward/backward block substitution
   with hand-unrolled 6x6 blocks, carried out directly in the array of xx.

   A  - matrix; A->data is used as a Mat_SeqBAIJ (i, j, diag, a and mbs are read)
   bb - right-hand side, only read
   xx - result vector, overwritten

   No row or column index sets are consulted: block row i uses entries
   6*i .. 6*i+5 of both bb and xx.
   Forward pass, row i: blocks ai[i] .. diag[i]-1.
   Backward pass, row i: blocks diag[i]+1 .. ai[i+1]-1 are subtracted and the
   block at diag[i] is then multiplied into the result.
   A block is addressed as v[r + 6*c] for row r and column c inside the block.
   NOTE(review): entries 0..5 are copied before any test on n, so n >= 1 looks
   like an unstated precondition -- confirm.

   Returns 0 on the normal path; the result of every PETSc call is passed
   through CHKERRQ.
*/
1937 #undef __FUNCT__
1938 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1939 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1940 {
1941   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1942   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1943   PetscErrorCode    ierr;
1944   PetscInt          *diag = a->diag,jdx;
1945   const MatScalar   *aa=a->a,*v;
1946   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1947   const PetscScalar *b;
1948 
1949   PetscFunctionBegin;
  /* b is only read below; the cast drops const to match the VecGetArray call */
1950   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1951   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1952   /* forward solve the lower triangular */
  /* block row 0 is a plain copy of the first 6 entries of b */
1953   idx    = 0;
1954   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1955   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1956   for (i=1; i<n; i++) {
1957     v     =  aa + 36*ai[i];
1958     vi    =  aj + ai[i];
1959     nz    =  diag[i] - ai[i];
1960     idx   =  6*i;
1961     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1962     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1963     while (nz--) {
      /* s -= (current block) * (segment of x at block column *vi) */
1964       jdx   = 6*(*vi++);
1965       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1966       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1967       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1968       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1969       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1970       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1971       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1972       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1973       v += 36;
1974      }
    /* store the finished segment of block row i */
1975     x[idx]   = s1;
1976     x[1+idx] = s2;
1977     x[2+idx] = s3;
1978     x[3+idx] = s4;
1979     x[4+idx] = s5;
1980     x[5+idx] = s6;
1981   }
1982   /* backward solve the upper triangular */
1983   for (i=n-1; i>=0; i--){
    /* start one block past diag[i] */
1984     v    = aa + 36*diag[i] + 36;
1985     vi   = aj + diag[i] + 1;
1986     nz   = ai[i+1] - diag[i] - 1;
1987     idt  = 6*i;
1988     s1 = x[idt];   s2 = x[1+idt];
1989     s3 = x[2+idt]; s4 = x[3+idt];
1990     s5 = x[4+idt]; s6 = x[5+idt];
1991     while (nz--) {
1992       idx   = 6*(*vi++);
1993       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1994       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1995       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1996       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1997       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1998       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1999       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2000       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2001       v += 36;
2002     }
    /* rewind to block diag[i] and multiply it into the accumulated segment */
2003     v        = aa + 36*diag[i];
2004     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2005     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2006     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2007     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2008     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2009     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2010   }
2011 
2012   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2013   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to the PETSc log: 2*36*nz - 6*(A->cmap->n) */
2014   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2015   PetscFunctionReturn(0);
2016 }
2017 
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct - forward/backward block
   substitution with hand-unrolled 6x6 blocks, carried out directly in the
   array of xx and locating every block range through the i array only.

   A  - matrix; A->data is used as a Mat_SeqBAIJ (i, j, a, mbs and bs2 are read)
   bb - right-hand side, only read
   xx - result vector, overwritten

   No row or column index sets are consulted: block row i uses entries
   bs*i .. bs*i+5 of both bb and xx.
   Forward pass, row i: blocks ai[i] .. ai[i+1]-1.
   Backward pass, row i: with k = 2*n-i, blocks ai[k] .. ai[k+1]-2 are
   subtracted and the block at ai[k+1]-1 is then multiplied into the result.
   A block is addressed as v[r + 6*c] for row r and column c inside the block.
   The literal offsets 0..35 are combined with the run-time bs and bs2, so the
   routine presumably requires bs == 6 and bs2 == 36 -- TODO confirm at the
   call site.
   NOTE(review): entries 0..5 are copied before any test on n, so n >= 1 looks
   like an unstated precondition -- confirm.

   Returns 0 on the normal path; the result of every PETSc call is passed
   through CHKERRQ.
*/
2018 #undef __FUNCT__
2019 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2020 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2021 {
2022     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2023     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2024     PetscErrorCode    ierr;
2025     PetscInt          idx,jdx,idt;
2026     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2027     const MatScalar   *aa=a->a,*v;
2028     PetscScalar       *x;
2029     const PetscScalar *b;
2030     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2031 
2032     PetscFunctionBegin;
    /* b is only read below; the cast drops const to match the VecGetArray call */
2033     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2034     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2035     /* forward solve the lower triangular */
    /* block row 0 is a plain copy of the first 6 entries of b */
2036     idx    = 0;
2037     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2038     x[4] = b[4+idx];x[5] = b[5+idx];
2039     for (i=1; i<n; i++) {
2040        v    = aa + bs2*ai[i];
2041        vi   = aj + ai[i];
2042        nz   = ai[i+1] - ai[i];
2043       idx   = bs*i;
2044        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2045        s5   = b[4+idx];s6 = b[5+idx];
2046        for(k=0;k<nz;k++){
          /* s -= (current block) * (segment of x at block column vi[k]) */
2047           jdx   = bs*vi[k];
2048           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2049 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2050           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2051           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2052           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2053 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2054           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2055 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2056           v   +=  bs2;
2057         }
2058 
       /* store the finished segment of block row i */
2059        x[idx]   = s1;
2060        x[1+idx] = s2;
2061        x[2+idx] = s3;
2062        x[3+idx] = s4;
2063        x[4+idx] = s5;
2064        x[5+idx] = s6;
2065     }
2066 
2067    /* backward solve the upper triangular */
2068   for (i=n-1; i>=0; i--){
     /* the range for row i is found at position 2*n-i of ai; its last block is kept for the final multiply */
2069      v   = aa + bs2*ai[2*n-i];
2070      vi  = aj + ai[2*n-i];
2071      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2072      idt = bs*i;
2073      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2074      s5 = x[4+idt];s6 = x[5+idt];
2075      for(k=0;k<nz;k++){
2076       idx   = bs*vi[k];
2077        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2078        x5    = x[4+idx];x6 = x[5+idx];
2079        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2080        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2081        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2082        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2083        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2084        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2085         v   +=  bs2;
2086     }
2087     /* x = inv_diagonal*x */
    /* v has advanced nz blocks and now points at the last block of the range */
2088    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2089    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2090    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2091    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2092    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2093    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2094   }
2095 
2096   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2097   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to the PETSc log: 2*bs2*nz - bs*(A->cmap->n) */
2098   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2099   PetscFunctionReturn(0);
2100 }
2101 
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2 - forward/backward block
   substitution with hand-unrolled 6x6 blocks, carried out directly in the
   array of xx; the backward pass locates its blocks through the diag array.

   A  - matrix; A->data is used as a Mat_SeqBAIJ (i, j, diag, a, mbs and bs2
        are read)
   bb - right-hand side, only read
   xx - result vector, overwritten

   No row or column index sets are consulted: block row i uses entries
   bs*i .. bs*i+5 of both bb and xx.
   Forward pass, row i: blocks ai[i] .. ai[i+1]-1.
   Backward pass, row i: blocks adiag[i+1]+1 .. adiag[i]-1 are subtracted and
   the block at adiag[i] is then multiplied into the result.
   A block is addressed as v[r + 6*c] for row r and column c inside the block.
   The literal offsets 0..35 are combined with the run-time bs and bs2, so the
   routine presumably requires bs == 6 and bs2 == 36 -- TODO confirm at the
   call site.
   NOTE(review): entries 0..5 are copied before any test on n, so n >= 1 looks
   like an unstated precondition -- confirm.

   Returns 0 on the normal path; the result of every PETSc call is passed
   through CHKERRQ.
*/
2102 #undef __FUNCT__
2103 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2"
2104 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2105 {
2106     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2107     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2108     PetscErrorCode    ierr;
2109     PetscInt          idx,jdx,idt;
2110     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2111     const MatScalar   *aa=a->a,*v;
2112     PetscScalar       *x;
2113     const PetscScalar *b;
2114     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2115 
2116     PetscFunctionBegin;
    /* b is only read below; the cast drops const to match the VecGetArray call */
2117     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2118     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2119     /* forward solve the lower triangular */
    /* block row 0 is a plain copy of the first 6 entries of b */
2120     idx    = 0;
2121     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2122     x[4] = b[4+idx];x[5] = b[5+idx];
2123     for (i=1; i<n; i++) {
2124        v    = aa + bs2*ai[i];
2125        vi   = aj + ai[i];
2126        nz   = ai[i+1] - ai[i];
2127       idx   = bs*i;
2128        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2129        s5   = b[4+idx];s6 = b[5+idx];
2130        for(k=0;k<nz;k++){
          /* s -= (current block) * (segment of x at block column vi[k]) */
2131           jdx   = bs*vi[k];
2132           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2133 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2134           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2135           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2136           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2137 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2138           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2139 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2140           v   +=  bs2;
2141         }
2142 
       /* store the finished segment of block row i */
2143        x[idx]   = s1;
2144        x[1+idx] = s2;
2145        x[2+idx] = s3;
2146        x[3+idx] = s4;
2147        x[4+idx] = s5;
2148        x[5+idx] = s6;
2149     }
2150 
2151    /* backward solve the upper triangular */
2152   for (i=n-1; i>=0; i--){
     /* adiag[i+1] is read, so for i = n-1 entry adiag[n] is accessed */
2153     v   = aa + bs2*(adiag[i+1]+1);
2154      vi  = aj + adiag[i+1]+1;
2155      nz  = adiag[i] - adiag[i+1]-1;
2156      idt = bs*i;
2157      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2158      s5 = x[4+idt];s6 = x[5+idt];
2159      for(k=0;k<nz;k++){
2160       idx   = bs*vi[k];
2161        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2162        x5    = x[4+idx];x6 = x[5+idx];
2163        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2164        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2165        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2166        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2167        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2168        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2169         v   +=  bs2;
2170     }
2171     /* x = inv_diagonal*x */
    /* v has advanced nz blocks and now points at block adiag[i] */
2172    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2173    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2174    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2175    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2176    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2177    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2178   }
2179 
2180   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2181   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to the PETSc log: 2*bs2*nz - bs*(A->cmap->n) */
2182   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2183   PetscFunctionReturn(0);
2184 }
2185 
/*
   MatSolve_SeqBAIJ_5 - forward/backward block substitution with hand-unrolled
   5x5 blocks, using the row and column index sets stored in the matrix.

   A  - matrix; A->data is used as a Mat_SeqBAIJ (i, j, diag, a, row, col,
        solve_work and mbs are read)
   bb - right-hand side, only read
   xx - result vector, overwritten

   Intermediate values live in the work array a->solve_work (t), one 5-entry
   segment per block row.  The segment for row i is seeded from bb at block
   position r[i] (indices of a->row); the final value of row i is written to
   xx at block position c[i] (indices of a->col).
   Forward pass, row i: blocks ai[i] .. diag[i]-1.
   Backward pass, row i: blocks diag[i]+1 .. ai[i+1]-1 are subtracted and the
   block at diag[i] is then multiplied into the result.
   A block is addressed as v[r + 5*c] for row r and column c inside the block.
   NOTE(review): r[0] and the first segment of bb are read before any test on
   n, so n >= 1 looks like an unstated precondition -- confirm.

   Returns 0 on the normal path; the result of every PETSc call is passed
   through CHKERRQ.
*/
2186 #undef __FUNCT__
2187 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2188 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
2189 {
2190   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2191   IS                iscol=a->col,isrow=a->row;
2192   PetscErrorCode    ierr;
2193   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
2194   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2195   const MatScalar   *aa=a->a,*v;
2196   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2197   const PetscScalar *b;
2198 
2199   PetscFunctionBegin;
  /* b is only read below; the cast drops const to match the VecGetArray call */
2200   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2201   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2202   t  = a->solve_work;
2203 
  /* r walks the row indices forward; c starts at the last column index and walks backward */
2204   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2205   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2206 
2207   /* forward solve the lower triangular */
  /* block row 0: copy the bb segment selected by the first row index into t */
2208   idx    = 5*(*r++);
2209   t[0] = b[idx];   t[1] = b[1+idx];
2210   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2211   for (i=1; i<n; i++) {
2212     v     = aa + 25*ai[i];
2213     vi    = aj + ai[i];
2214     nz    = diag[i] - ai[i];
2215     idx   = 5*(*r++);
2216     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2217     s5  = b[4+idx];
2218     while (nz--) {
      /* s -= (current block) * (segment of t at block column *vi) */
2219       idx   = 5*(*vi++);
2220       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2221       x4    = t[3+idx];x5 = t[4+idx];
2222       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2223       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2224       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2225       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2226       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2227       v += 25;
2228     }
    /* the work array is indexed by i itself, not by the row index */
2229     idx = 5*i;
2230     t[idx]   = s1;t[1+idx] = s2;
2231     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2232   }
2233   /* backward solve the upper triangular */
2234   for (i=n-1; i>=0; i--){
    /* start one block past diag[i] */
2235     v    = aa + 25*diag[i] + 25;
2236     vi   = aj + diag[i] + 1;
2237     nz   = ai[i+1] - diag[i] - 1;
2238     idt  = 5*i;
2239     s1 = t[idt];  s2 = t[1+idt];
2240     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2241     while (nz--) {
2242       idx   = 5*(*vi++);
2243       x1    = t[idx];   x2 = t[1+idx];
2244       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2245       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2246       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2247       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2248       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2249       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2250       v += 25;
2251     }
    /* multiply by block diag[i]; the product goes to t (row i) and to x (block position *c) */
2252     idc = 5*(*c--);
2253     v   = aa + 25*diag[i];
2254     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2255                                  v[15]*s4+v[20]*s5;
2256     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2257                                  v[16]*s4+v[21]*s5;
2258     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2259                                  v[17]*s4+v[22]*s5;
2260     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2261                                  v[18]*s4+v[23]*s5;
2262     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2263                                  v[19]*s4+v[24]*s5;
2264   }
2265 
2266   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2267   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2268   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2269   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to the PETSc log: 2*25*nz - 5*(A->cmap->n) */
2270   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2271   PetscFunctionReturn(0);
2272 }
2273 
2274 #undef __FUNCT__
2275 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2276 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
2277 {
2278   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2279   IS                iscol=a->col,isrow=a->row;
2280   PetscErrorCode    ierr;
2281   const PetscInt    *r,*c,*rout,*cout;
2282   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2283   const MatScalar   *aa=a->a,*v;
2284   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2285   const PetscScalar *b;
2286 
2287   PetscFunctionBegin;
2288   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2289   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2290   t  = a->solve_work;
2291 
2292   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2293   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2294 
2295   /* forward solve the lower triangular */
2296   idx    = 5*r[0];
2297   t[0] = b[idx];   t[1] = b[1+idx];
2298   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2299   for (i=1; i<n; i++) {
2300     v     = aa + 25*ai[i];
2301     vi    = aj + ai[i];
2302     nz    = ai[i+1] - ai[i];
2303     idx   = 5*r[i];
2304     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2305     s5  = b[4+idx];
2306     for(m=0;m<nz;m++){
2307       idx   = 5*vi[m];
2308       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2309       x4    = t[3+idx];x5 = t[4+idx];
2310       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2311       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2312       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2313       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2314       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2315       v += 25;
2316     }
2317     idx = 5*i;
2318     t[idx]   = s1;t[1+idx] = s2;
2319     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2320   }
2321   /* backward solve the upper triangular */
2322   for (i=n-1; i>=0; i--){
2323     k    = 2*n-i;
2324     v    = aa + 25*ai[k];
2325     vi   = aj + ai[k];
2326     nz   = ai[k+1] - ai[k] - 1;
2327     idt  = 5*i;
2328     s1 = t[idt];  s2 = t[1+idt];
2329     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2330     for(m=0;m<nz;m++){
2331       idx   = 5*vi[m];
2332       x1    = t[idx];   x2 = t[1+idx];
2333       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2334       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2335       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2336       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2337       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2338       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2339       v += 25;
2340     }
2341     idc = 5*c[i];
2342     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2343                                  v[15]*s4+v[20]*s5;
2344     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2345                                  v[16]*s4+v[21]*s5;
2346     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2347                                  v[17]*s4+v[22]*s5;
2348     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2349                                  v[18]*s4+v[23]*s5;
2350     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2351                                  v[19]*s4+v[24]*s5;
2352   }
2353 
2354   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2355   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2356   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2357   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2358   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2359   PetscFunctionReturn(0);
2360 }
2361 
2362 #undef __FUNCT__
2363 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2"
2364 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2365 {
2366   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2367   IS                iscol=a->col,isrow=a->row;
2368   PetscErrorCode    ierr;
2369   const PetscInt    *r,*c,*rout,*cout;
2370   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2371   const MatScalar   *aa=a->a,*v;
2372   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2373   const PetscScalar *b;
2374 
2375   PetscFunctionBegin;
2376   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2377   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2378   t  = a->solve_work;
2379 
2380   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2381   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2382 
2383   /* forward solve the lower triangular */
2384   idx    = 5*r[0];
2385   t[0] = b[idx];   t[1] = b[1+idx];
2386   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2387   for (i=1; i<n; i++) {
2388     v     = aa + 25*ai[i];
2389     vi    = aj + ai[i];
2390     nz    = ai[i+1] - ai[i];
2391     idx   = 5*r[i];
2392     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2393     s5  = b[4+idx];
2394     for(m=0;m<nz;m++){
2395       idx   = 5*vi[m];
2396       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2397       x4    = t[3+idx];x5 = t[4+idx];
2398       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2399       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2400       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2401       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2402       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2403       v += 25;
2404     }
2405     idx = 5*i;
2406     t[idx]   = s1;t[1+idx] = s2;
2407     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2408   }
2409   /* backward solve the upper triangular */
2410   for (i=n-1; i>=0; i--){
2411     v    = aa + 25*(adiag[i+1]+1);
2412     vi   = aj + adiag[i+1]+1;
2413     nz   = adiag[i] - adiag[i+1] - 1;
2414     idt  = 5*i;
2415     s1 = t[idt];  s2 = t[1+idt];
2416     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2417     for(m=0;m<nz;m++){
2418       idx   = 5*vi[m];
2419       x1    = t[idx];   x2 = t[1+idx];
2420       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2421       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2422       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2423       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2424       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2425       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2426       v += 25;
2427     }
2428     idc = 5*c[i];
2429     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2430                                  v[15]*s4+v[20]*s5;
2431     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2432                                  v[16]*s4+v[21]*s5;
2433     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2434                                  v[17]*s4+v[22]*s5;
2435     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2436                                  v[18]*s4+v[23]*s5;
2437     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2438                                  v[19]*s4+v[24]*s5;
2439   }
2440 
2441   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2442   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2443   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2444   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2445   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2446   PetscFunctionReturn(0);
2447 }
2448 
2449 #undef __FUNCT__
2450 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2451 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
2452 {
2453   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2454   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2455   PetscErrorCode    ierr;
2456   PetscInt          *diag = a->diag,jdx;
2457   const MatScalar   *aa=a->a,*v;
2458   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2459   const PetscScalar *b;
2460 
2461   PetscFunctionBegin;
2462   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2463   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2464   /* forward solve the lower triangular */
2465   idx    = 0;
2466   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2467   for (i=1; i<n; i++) {
2468     v     =  aa + 25*ai[i];
2469     vi    =  aj + ai[i];
2470     nz    =  diag[i] - ai[i];
2471     idx   =  5*i;
2472     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2473     while (nz--) {
2474       jdx   = 5*(*vi++);
2475       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2476       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2477       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2478       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2479       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2480       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2481       v    += 25;
2482     }
2483     x[idx]   = s1;
2484     x[1+idx] = s2;
2485     x[2+idx] = s3;
2486     x[3+idx] = s4;
2487     x[4+idx] = s5;
2488   }
2489   /* backward solve the upper triangular */
2490   for (i=n-1; i>=0; i--){
2491     v    = aa + 25*diag[i] + 25;
2492     vi   = aj + diag[i] + 1;
2493     nz   = ai[i+1] - diag[i] - 1;
2494     idt  = 5*i;
2495     s1 = x[idt];  s2 = x[1+idt];
2496     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2497     while (nz--) {
2498       idx   = 5*(*vi++);
2499       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2500       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2501       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2502       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2503       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2504       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2505       v    += 25;
2506     }
2507     v        = aa + 25*diag[i];
2508     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2509     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2510     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2511     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2512     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2513   }
2514 
2515   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2516   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2517   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2518   PetscFunctionReturn(0);
2519 }
2520 
2521 #undef __FUNCT__
2522 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2523 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2524 {
2525   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2526   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2527   PetscErrorCode    ierr;
2528   PetscInt          jdx;
2529   const MatScalar   *aa=a->a,*v;
2530   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2531   const PetscScalar *b;
2532 
2533   PetscFunctionBegin;
2534   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2535   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2536   /* forward solve the lower triangular */
2537   idx    = 0;
2538   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2539   for (i=1; i<n; i++) {
2540     v   = aa + 25*ai[i];
2541     vi  = aj + ai[i];
2542     nz  = ai[i+1] - ai[i];
2543     idx = 5*i;
2544     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2545     for(k=0;k<nz;k++) {
2546       jdx   = 5*vi[k];
2547       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2548       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2549       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2550       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2551       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2552       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2553       v    += 25;
2554     }
2555     x[idx]   = s1;
2556     x[1+idx] = s2;
2557     x[2+idx] = s3;
2558     x[3+idx] = s4;
2559     x[4+idx] = s5;
2560   }
2561 
2562   /* backward solve the upper triangular */
2563   for (i=n-1; i>=0; i--){
2564     v   = aa + 25*ai[2*n-i];
2565     vi  = aj + ai[2*n-i];
2566     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2567     idt = 5*i;
2568     s1 = x[idt];  s2 = x[1+idt];
2569     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2570     for(k=0;k<nz;k++){
2571       idx   = 5*vi[k];
2572       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2573       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2574       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2575       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2576       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2577       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2578       v    += 25;
2579     }
2580     /* x = inv_diagonal*x */
2581     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2582     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2583     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2584     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2585     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2586   }
2587 
2588   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2589   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2590   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2591   PetscFunctionReturn(0);
2592 }
2593 
2594 #undef __FUNCT__
2595 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2"
2596 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2597 {
2598   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2599   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
2600   PetscErrorCode    ierr;
2601   PetscInt          jdx;
2602   const MatScalar   *aa=a->a,*v;
2603   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2604   const PetscScalar *b;
2605 
2606   PetscFunctionBegin;
2607   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2608   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2609   /* forward solve the lower triangular */
2610   idx    = 0;
2611   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2612   for (i=1; i<n; i++) {
2613     v   = aa + 25*ai[i];
2614     vi  = aj + ai[i];
2615     nz  = ai[i+1] - ai[i];
2616     idx = 5*i;
2617     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2618     for(k=0;k<nz;k++) {
2619       jdx   = 5*vi[k];
2620       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2621       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2622       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2623       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2624       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2625       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2626       v    += 25;
2627     }
2628     x[idx]   = s1;
2629     x[1+idx] = s2;
2630     x[2+idx] = s3;
2631     x[3+idx] = s4;
2632     x[4+idx] = s5;
2633   }
2634 
2635   /* backward solve the upper triangular */
2636   for (i=n-1; i>=0; i--){
2637     v   = aa + 25*(adiag[i+1]+1);
2638     vi  = aj + adiag[i+1]+1;
2639     nz  = adiag[i] - adiag[i+1]-1;
2640     idt = 5*i;
2641     s1 = x[idt];  s2 = x[1+idt];
2642     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2643     for(k=0;k<nz;k++){
2644       idx   = 5*vi[k];
2645       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2646       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2647       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2648       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2649       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2650       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2651       v    += 25;
2652     }
2653     /* x = inv_diagonal*x */
2654     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2655     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2656     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2657     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2658     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2659   }
2660 
2661   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2662   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2663   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2664   PetscFunctionReturn(0);
2665 }
2666 
2667 #undef __FUNCT__
2668 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2669 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2670 {
2671   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2672   IS                iscol=a->col,isrow=a->row;
2673   PetscErrorCode    ierr;
2674   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2675   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2676   const MatScalar   *aa=a->a,*v;
2677   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2678   const PetscScalar *b;
2679 
2680   PetscFunctionBegin;
2681   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2682   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2683   t  = a->solve_work;
2684 
2685   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2686   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2687 
2688   /* forward solve the lower triangular */
2689   idx    = 4*(*r++);
2690   t[0] = b[idx];   t[1] = b[1+idx];
2691   t[2] = b[2+idx]; t[3] = b[3+idx];
2692   for (i=1; i<n; i++) {
2693     v     = aa + 16*ai[i];
2694     vi    = aj + ai[i];
2695     nz    = diag[i] - ai[i];
2696     idx   = 4*(*r++);
2697     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2698     while (nz--) {
2699       idx   = 4*(*vi++);
2700       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2701       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2702       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2703       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2704       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2705       v    += 16;
2706     }
2707     idx        = 4*i;
2708     t[idx]   = s1;t[1+idx] = s2;
2709     t[2+idx] = s3;t[3+idx] = s4;
2710   }
2711   /* backward solve the upper triangular */
2712   for (i=n-1; i>=0; i--){
2713     v    = aa + 16*diag[i] + 16;
2714     vi   = aj + diag[i] + 1;
2715     nz   = ai[i+1] - diag[i] - 1;
2716     idt  = 4*i;
2717     s1 = t[idt];  s2 = t[1+idt];
2718     s3 = t[2+idt];s4 = t[3+idt];
2719     while (nz--) {
2720       idx   = 4*(*vi++);
2721       x1    = t[idx];   x2 = t[1+idx];
2722       x3    = t[2+idx]; x4 = t[3+idx];
2723       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2724       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2725       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2726       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2727       v += 16;
2728     }
2729     idc      = 4*(*c--);
2730     v        = aa + 16*diag[i];
2731     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2732     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2733     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2734     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2735   }
2736 
2737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2739   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2741   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2742   PetscFunctionReturn(0);
2743 }
2744 
2745 #undef __FUNCT__
2746 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2747 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
2748 {
2749   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2750   IS                iscol=a->col,isrow=a->row;
2751   PetscErrorCode    ierr;
2752   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2753   const PetscInt    *r,*c,*rout,*cout;
2754   const MatScalar   *aa=a->a,*v;
2755   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2756   const PetscScalar *b;
2757 
2758   PetscFunctionBegin;
2759   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2760   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2761   t  = a->solve_work;
2762 
2763   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2764   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2765 
2766   /* forward solve the lower triangular */
2767   idx    = 4*r[0];
2768   t[0] = b[idx];   t[1] = b[1+idx];
2769   t[2] = b[2+idx]; t[3] = b[3+idx];
2770   for (i=1; i<n; i++) {
2771     v     = aa + 16*ai[i];
2772     vi    = aj + ai[i];
2773     nz    = ai[i+1] - ai[i];
2774     idx   = 4*r[i];
2775     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2776     for(m=0;m<nz;m++){
2777       idx   = 4*vi[m];
2778       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2779       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2780       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2781       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2782       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2783       v    += 16;
2784     }
2785     idx        = 4*i;
2786     t[idx]   = s1;t[1+idx] = s2;
2787     t[2+idx] = s3;t[3+idx] = s4;
2788   }
2789   /* backward solve the upper triangular */
2790   for (i=n-1; i>=0; i--){
2791     k    = 2*n-i;
2792     v    = aa + 16*ai[k];
2793     vi   = aj + ai[k];
2794     nz   = ai[k+1] - ai[k] - 1;
2795     idt  = 4*i;
2796     s1 = t[idt];  s2 = t[1+idt];
2797     s3 = t[2+idt];s4 = t[3+idt];
2798     for(m=0;m<nz;m++){
2799       idx   = 4*vi[m];
2800       x1    = t[idx];   x2 = t[1+idx];
2801       x3    = t[2+idx]; x4 = t[3+idx];
2802       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2803       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2804       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2805       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2806       v += 16;
2807     }
2808     idc      = 4*c[i];
2809     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2810     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2811     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2812     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2813   }
2814 
2815   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2816   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2817   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2818   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2819   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2820   PetscFunctionReturn(0);
2821 }
2822 
2823 #undef __FUNCT__
2824 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2"
2825 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2826 {
2827   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2828   IS                iscol=a->col,isrow=a->row;
2829   PetscErrorCode    ierr;
2830   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2831   const PetscInt    *r,*c,*rout,*cout;
2832   const MatScalar   *aa=a->a,*v;
2833   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2834   const PetscScalar *b;
2835 
2836   PetscFunctionBegin;
2837   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2838   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2839   t  = a->solve_work;
2840 
2841   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2842   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2843 
2844   /* forward solve the lower triangular */
2845   idx    = 4*r[0];
2846   t[0] = b[idx];   t[1] = b[1+idx];
2847   t[2] = b[2+idx]; t[3] = b[3+idx];
2848   for (i=1; i<n; i++) {
2849     v     = aa + 16*ai[i];
2850     vi    = aj + ai[i];
2851     nz    = ai[i+1] - ai[i];
2852     idx   = 4*r[i];
2853     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2854     for(m=0;m<nz;m++){
2855       idx   = 4*vi[m];
2856       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2857       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2858       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2859       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2860       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2861       v    += 16;
2862     }
2863     idx        = 4*i;
2864     t[idx]   = s1;t[1+idx] = s2;
2865     t[2+idx] = s3;t[3+idx] = s4;
2866   }
2867   /* backward solve the upper triangular */
2868   for (i=n-1; i>=0; i--){
2869     v    = aa + 16*(adiag[i+1]+1);
2870     vi   = aj + adiag[i+1]+1;
2871     nz   = adiag[i] - adiag[i+1] - 1;
2872     idt  = 4*i;
2873     s1 = t[idt];  s2 = t[1+idt];
2874     s3 = t[2+idt];s4 = t[3+idt];
2875     for(m=0;m<nz;m++){
2876       idx   = 4*vi[m];
2877       x1    = t[idx];   x2 = t[1+idx];
2878       x3    = t[2+idx]; x4 = t[3+idx];
2879       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2880       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2881       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2882       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2883       v += 16;
2884     }
2885     idc      = 4*c[i];
2886     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2887     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2888     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2889     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2890   }
2891 
2892   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2893   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2894   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2895   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2896   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2897   PetscFunctionReturn(0);
2898 }
2899 
2900 #undef __FUNCT__
2901 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2902 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2903 {
2904   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2905   IS                iscol=a->col,isrow=a->row;
2906   PetscErrorCode    ierr;
2907   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2908   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2909   const MatScalar   *aa=a->a,*v;
2910   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2911   PetscScalar       *x;
2912   const PetscScalar *b;
2913 
2914   PetscFunctionBegin;
2915   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2916   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2917   t  = (MatScalar *)a->solve_work;
2918 
2919   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2920   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2921 
2922   /* forward solve the lower triangular */
2923   idx    = 4*(*r++);
2924   t[0] = (MatScalar)b[idx];
2925   t[1] = (MatScalar)b[1+idx];
2926   t[2] = (MatScalar)b[2+idx];
2927   t[3] = (MatScalar)b[3+idx];
2928   for (i=1; i<n; i++) {
2929     v     = aa + 16*ai[i];
2930     vi    = aj + ai[i];
2931     nz    = diag[i] - ai[i];
2932     idx   = 4*(*r++);
2933     s1 = (MatScalar)b[idx];
2934     s2 = (MatScalar)b[1+idx];
2935     s3 = (MatScalar)b[2+idx];
2936     s4 = (MatScalar)b[3+idx];
2937     while (nz--) {
2938       idx   = 4*(*vi++);
2939       x1  = t[idx];
2940       x2  = t[1+idx];
2941       x3  = t[2+idx];
2942       x4  = t[3+idx];
2943       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2944       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2945       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2946       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2947       v    += 16;
2948     }
2949     idx        = 4*i;
2950     t[idx]   = s1;
2951     t[1+idx] = s2;
2952     t[2+idx] = s3;
2953     t[3+idx] = s4;
2954   }
2955   /* backward solve the upper triangular */
2956   for (i=n-1; i>=0; i--){
2957     v    = aa + 16*diag[i] + 16;
2958     vi   = aj + diag[i] + 1;
2959     nz   = ai[i+1] - diag[i] - 1;
2960     idt  = 4*i;
2961     s1 = t[idt];
2962     s2 = t[1+idt];
2963     s3 = t[2+idt];
2964     s4 = t[3+idt];
2965     while (nz--) {
2966       idx   = 4*(*vi++);
2967       x1  = t[idx];
2968       x2  = t[1+idx];
2969       x3  = t[2+idx];
2970       x4  = t[3+idx];
2971       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2972       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2973       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2974       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2975       v += 16;
2976     }
2977     idc      = 4*(*c--);
2978     v        = aa + 16*diag[i];
2979     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2980     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2981     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2982     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2983     x[idc]   = (PetscScalar)t[idt];
2984     x[1+idc] = (PetscScalar)t[1+idt];
2985     x[2+idc] = (PetscScalar)t[2+idt];
2986     x[3+idc] = (PetscScalar)t[3+idt];
2987  }
2988 
2989   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2990   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2991   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2992   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2993   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2994   PetscFunctionReturn(0);
2995 }
2996 
2997 #if defined (PETSC_HAVE_SSE)
2998 
2999 #include PETSC_HAVE_SSE
3000 
3001 #undef __FUNCT__
3002 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3003 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3004 {
3005   /*
3006      Note: This code uses demotion of double
3007      to float when performing the mixed-mode computation.
3008      This may not be numerically reasonable for all applications.
3009   */
3010   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3011   IS             iscol=a->col,isrow=a->row;
3012   PetscErrorCode ierr;
3013   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3014   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3015   MatScalar      *aa=a->a,*v;
3016   PetscScalar    *x,*b,*t;
3017 
3018   /* Make space in temp stack for 16 Byte Aligned arrays */
3019   float           ssealignedspace[11],*tmps,*tmpx;
3020   unsigned long   offset;
3021 
3022   PetscFunctionBegin;
3023   SSE_SCOPE_BEGIN;
3024 
3025     offset = (unsigned long)ssealignedspace % 16;
3026     if (offset) offset = (16 - offset)/4;
3027     tmps = &ssealignedspace[offset];
3028     tmpx = &ssealignedspace[offset+4];
3029     PREFETCH_NTA(aa+16*ai[1]);
3030 
3031     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3032     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3033     t  = a->solve_work;
3034 
3035     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3036     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3037 
3038     /* forward solve the lower triangular */
3039     idx  = 4*(*r++);
3040     t[0] = b[idx];   t[1] = b[1+idx];
3041     t[2] = b[2+idx]; t[3] = b[3+idx];
3042     v    =  aa + 16*ai[1];
3043 
3044     for (i=1; i<n;) {
3045       PREFETCH_NTA(&v[8]);
3046       vi   =  aj      + ai[i];
3047       nz   =  diag[i] - ai[i];
3048       idx  =  4*(*r++);
3049 
3050       /* Demote sum from double to float */
3051       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3052       LOAD_PS(tmps,XMM7);
3053 
3054       while (nz--) {
3055         PREFETCH_NTA(&v[16]);
3056         idx = 4*(*vi++);
3057 
3058         /* Demote solution (so far) from double to float */
3059         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3060 
3061         /* 4x4 Matrix-Vector product with negative accumulation: */
3062         SSE_INLINE_BEGIN_2(tmpx,v)
3063           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3064 
3065           /* First Column */
3066           SSE_COPY_PS(XMM0,XMM6)
3067           SSE_SHUFFLE(XMM0,XMM0,0x00)
3068           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3069           SSE_SUB_PS(XMM7,XMM0)
3070 
3071           /* Second Column */
3072           SSE_COPY_PS(XMM1,XMM6)
3073           SSE_SHUFFLE(XMM1,XMM1,0x55)
3074           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3075           SSE_SUB_PS(XMM7,XMM1)
3076 
3077           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3078 
3079           /* Third Column */
3080           SSE_COPY_PS(XMM2,XMM6)
3081           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3082           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3083           SSE_SUB_PS(XMM7,XMM2)
3084 
3085           /* Fourth Column */
3086           SSE_COPY_PS(XMM3,XMM6)
3087           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3088           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3089           SSE_SUB_PS(XMM7,XMM3)
3090         SSE_INLINE_END_2
3091 
3092         v  += 16;
3093       }
3094       idx = 4*i;
3095       v   = aa + 16*ai[++i];
3096       PREFETCH_NTA(v);
3097       STORE_PS(tmps,XMM7);
3098 
3099       /* Promote result from float to double */
3100       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3101     }
3102     /* backward solve the upper triangular */
3103     idt  = 4*(n-1);
3104     ai16 = 16*diag[n-1];
3105     v    = aa + ai16 + 16;
3106     for (i=n-1; i>=0;){
3107       PREFETCH_NTA(&v[8]);
3108       vi = aj + diag[i] + 1;
3109       nz = ai[i+1] - diag[i] - 1;
3110 
3111       /* Demote accumulator from double to float */
3112       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3113       LOAD_PS(tmps,XMM7);
3114 
3115       while (nz--) {
3116         PREFETCH_NTA(&v[16]);
3117         idx = 4*(*vi++);
3118 
3119         /* Demote solution (so far) from double to float */
3120         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3121 
3122         /* 4x4 Matrix-Vector Product with negative accumulation: */
3123         SSE_INLINE_BEGIN_2(tmpx,v)
3124           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3125 
3126           /* First Column */
3127           SSE_COPY_PS(XMM0,XMM6)
3128           SSE_SHUFFLE(XMM0,XMM0,0x00)
3129           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3130           SSE_SUB_PS(XMM7,XMM0)
3131 
3132           /* Second Column */
3133           SSE_COPY_PS(XMM1,XMM6)
3134           SSE_SHUFFLE(XMM1,XMM1,0x55)
3135           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3136           SSE_SUB_PS(XMM7,XMM1)
3137 
3138           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3139 
3140           /* Third Column */
3141           SSE_COPY_PS(XMM2,XMM6)
3142           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3143           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3144           SSE_SUB_PS(XMM7,XMM2)
3145 
3146           /* Fourth Column */
3147           SSE_COPY_PS(XMM3,XMM6)
3148           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3149           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3150           SSE_SUB_PS(XMM7,XMM3)
3151         SSE_INLINE_END_2
3152         v  += 16;
3153       }
3154       v    = aa + ai16;
3155       ai16 = 16*diag[--i];
3156       PREFETCH_NTA(aa+ai16+16);
3157       /*
3158          Scale the result by the diagonal 4x4 block,
3159          which was inverted as part of the factorization
3160       */
3161       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3162         /* First Column */
3163         SSE_COPY_PS(XMM0,XMM7)
3164         SSE_SHUFFLE(XMM0,XMM0,0x00)
3165         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3166 
3167         /* Second Column */
3168         SSE_COPY_PS(XMM1,XMM7)
3169         SSE_SHUFFLE(XMM1,XMM1,0x55)
3170         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3171         SSE_ADD_PS(XMM0,XMM1)
3172 
3173         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3174 
3175         /* Third Column */
3176         SSE_COPY_PS(XMM2,XMM7)
3177         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3178         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3179         SSE_ADD_PS(XMM0,XMM2)
3180 
3181         /* Fourth Column */
3182         SSE_COPY_PS(XMM3,XMM7)
3183         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3184         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3185         SSE_ADD_PS(XMM0,XMM3)
3186 
3187         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3188       SSE_INLINE_END_3
3189 
3190       /* Promote solution from float to double */
3191       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3192 
3193       /* Apply reordering to t and stream into x.    */
3194       /* This way, x doesn't pollute the cache.      */
3195       /* Be careful with size: 2 doubles = 4 floats! */
3196       idc  = 4*(*c--);
3197       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3198         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3199         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3200         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3201         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3202         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3203         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3204       SSE_INLINE_END_2
3205       v    = aa + ai16 + 16;
3206       idt -= 4;
3207     }
3208 
3209     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3210     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3211     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3212     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3213     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3214   SSE_SCOPE_END;
3215   PetscFunctionReturn(0);
3216 }
3217 
3218 #endif
3219 
3220 
3221 /*
3222       Special case where the matrix was ILU(0) factored in the natural
3223    ordering. This eliminates the need for the column and row permutation.
3224 */
3225 #undef __FUNCT__
3226 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3227 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3228 {
3229   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3230   PetscInt          n=a->mbs;
3231   const PetscInt    *ai=a->i,*aj=a->j;
3232   PetscErrorCode    ierr;
3233   const PetscInt    *diag = a->diag;
3234   const MatScalar   *aa=a->a;
3235   PetscScalar       *x;
3236   const PetscScalar *b;
3237 
3238   PetscFunctionBegin;
3239   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3240   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3241 
3242 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3243   {
3244     static PetscScalar w[2000]; /* very BAD need to fix */
3245     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3246   }
3247 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3248   {
3249     static PetscScalar w[2000]; /* very BAD need to fix */
3250     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3251   }
3252 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3253   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3254 #else
3255   {
3256     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3257     const MatScalar *v;
3258     PetscInt        jdx,idt,idx,nz,i,ai16;
3259     const PetscInt  *vi;
3260 
3261   /* forward solve the lower triangular */
3262   idx    = 0;
3263   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3264   for (i=1; i<n; i++) {
3265     v     =  aa      + 16*ai[i];
3266     vi    =  aj      + ai[i];
3267     nz    =  diag[i] - ai[i];
3268     idx   +=  4;
3269     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3270     while (nz--) {
3271       jdx   = 4*(*vi++);
3272       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3273       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3274       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3275       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3276       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3277       v    += 16;
3278     }
3279     x[idx]   = s1;
3280     x[1+idx] = s2;
3281     x[2+idx] = s3;
3282     x[3+idx] = s4;
3283   }
3284   /* backward solve the upper triangular */
3285   idt = 4*(n-1);
3286   for (i=n-1; i>=0; i--){
3287     ai16 = 16*diag[i];
3288     v    = aa + ai16 + 16;
3289     vi   = aj + diag[i] + 1;
3290     nz   = ai[i+1] - diag[i] - 1;
3291     s1 = x[idt];  s2 = x[1+idt];
3292     s3 = x[2+idt];s4 = x[3+idt];
3293     while (nz--) {
3294       idx   = 4*(*vi++);
3295       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3296       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3297       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3298       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3299       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3300       v    += 16;
3301     }
3302     v        = aa + ai16;
3303     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3304     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3305     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3306     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3307     idt -= 4;
3308   }
3309   }
3310 #endif
3311 
3312   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3313   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3314   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3315   PetscFunctionReturn(0);
3316 }
3317 
3318 #undef __FUNCT__
3319 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3320 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3321 {
3322     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3323     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3324     PetscErrorCode    ierr;
3325     PetscInt          idx,jdx,idt;
3326     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3327     const MatScalar   *aa=a->a,*v;
3328     PetscScalar       *x;
3329     const PetscScalar *b;
3330     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3331 
3332     PetscFunctionBegin;
3333     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3334     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3335     /* forward solve the lower triangular */
3336     idx    = 0;
3337     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3338     for (i=1; i<n; i++) {
3339        v    = aa + bs2*ai[i];
3340        vi   = aj + ai[i];
3341        nz   = ai[i+1] - ai[i];
3342       idx   = bs*i;
3343        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3344       for(k=0;k<nz;k++) {
3345           jdx   = bs*vi[k];
3346           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3347           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3348           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3349           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3350 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3351 
3352           v   +=  bs2;
3353         }
3354 
3355        x[idx]   = s1;
3356        x[1+idx] = s2;
3357        x[2+idx] = s3;
3358        x[3+idx] = s4;
3359     }
3360 
3361    /* backward solve the upper triangular */
3362   for (i=n-1; i>=0; i--){
3363      v   = aa + bs2*ai[2*n-i];
3364      vi  = aj + ai[2*n-i];
3365      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3366      idt = bs*i;
3367      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3368 
3369     for(k=0;k<nz;k++){
3370       idx   = bs*vi[k];
3371        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3372        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3373        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3374        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3375        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3376 
3377         v   +=  bs2;
3378     }
3379     /* x = inv_diagonal*x */
3380    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3381    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3382    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3383    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3384 
3385   }
3386 
3387   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3388   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3389   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3390   PetscFunctionReturn(0);
3391 }
3392 
3393 #undef __FUNCT__
3394 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
3395 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3396 {
3397     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3398     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3399     PetscErrorCode    ierr;
3400     PetscInt          idx,jdx,idt;
3401     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3402     const MatScalar   *aa=a->a,*v;
3403     PetscScalar       *x;
3404     const PetscScalar *b;
3405     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3406 
3407     PetscFunctionBegin;
3408     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3409     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3410     /* forward solve the lower triangular */
3411     idx    = 0;
3412     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3413     for (i=1; i<n; i++) {
3414        v    = aa + bs2*ai[i];
3415        vi   = aj + ai[i];
3416        nz   = ai[i+1] - ai[i];
3417       idx   = bs*i;
3418        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3419       for(k=0;k<nz;k++) {
3420           jdx   = bs*vi[k];
3421           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3422           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3423           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3424           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3425 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3426 
3427           v   +=  bs2;
3428         }
3429 
3430        x[idx]   = s1;
3431        x[1+idx] = s2;
3432        x[2+idx] = s3;
3433        x[3+idx] = s4;
3434     }
3435 
3436    /* backward solve the upper triangular */
3437   for (i=n-1; i>=0; i--){
3438     v   = aa + bs2*(adiag[i+1]+1);
3439      vi  = aj + adiag[i+1]+1;
3440      nz  = adiag[i] - adiag[i+1]-1;
3441      idt = bs*i;
3442      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3443 
3444     for(k=0;k<nz;k++){
3445       idx   = bs*vi[k];
3446        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3447        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3448        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3449        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3450        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3451 
3452         v   +=  bs2;
3453     }
3454     /* x = inv_diagonal*x */
3455    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3456    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3457    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3458    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3459 
3460   }
3461 
3462   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3463   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3464   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3465   PetscFunctionReturn(0);
3466 }
3467 
3468 #undef __FUNCT__
3469 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3470 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3471 {
3472   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3473   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3474   PetscErrorCode ierr;
3475   PetscInt       *diag = a->diag;
3476   MatScalar      *aa=a->a;
3477   PetscScalar    *x,*b;
3478 
3479   PetscFunctionBegin;
3480   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3481   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3482 
3483   {
3484     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3485     MatScalar  *v,*t=(MatScalar *)x;
3486     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3487 
3488     /* forward solve the lower triangular */
3489     idx  = 0;
3490     t[0] = (MatScalar)b[0];
3491     t[1] = (MatScalar)b[1];
3492     t[2] = (MatScalar)b[2];
3493     t[3] = (MatScalar)b[3];
3494     for (i=1; i<n; i++) {
3495       v     =  aa      + 16*ai[i];
3496       vi    =  aj      + ai[i];
3497       nz    =  diag[i] - ai[i];
3498       idx   +=  4;
3499       s1 = (MatScalar)b[idx];
3500       s2 = (MatScalar)b[1+idx];
3501       s3 = (MatScalar)b[2+idx];
3502       s4 = (MatScalar)b[3+idx];
3503       while (nz--) {
3504         jdx = 4*(*vi++);
3505         x1  = t[jdx];
3506         x2  = t[1+jdx];
3507         x3  = t[2+jdx];
3508         x4  = t[3+jdx];
3509         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3510         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3511         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3512         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3513         v    += 16;
3514       }
3515       t[idx]   = s1;
3516       t[1+idx] = s2;
3517       t[2+idx] = s3;
3518       t[3+idx] = s4;
3519     }
3520     /* backward solve the upper triangular */
3521     idt = 4*(n-1);
3522     for (i=n-1; i>=0; i--){
3523       ai16 = 16*diag[i];
3524       v    = aa + ai16 + 16;
3525       vi   = aj + diag[i] + 1;
3526       nz   = ai[i+1] - diag[i] - 1;
3527       s1   = t[idt];
3528       s2   = t[1+idt];
3529       s3   = t[2+idt];
3530       s4   = t[3+idt];
3531       while (nz--) {
3532         idx = 4*(*vi++);
3533         x1  = (MatScalar)x[idx];
3534         x2  = (MatScalar)x[1+idx];
3535         x3  = (MatScalar)x[2+idx];
3536         x4  = (MatScalar)x[3+idx];
3537         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3538         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3539         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3540         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3541         v    += 16;
3542       }
3543       v        = aa + ai16;
3544       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3545       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3546       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3547       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3548       idt -= 4;
3549     }
3550   }
3551 
3552   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3553   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3554   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3555   PetscFunctionReturn(0);
3556 }
3557 
3558 #if defined (PETSC_HAVE_SSE)
3559 
3560 #include PETSC_HAVE_SSE
3561 #undef __FUNCT__
3562 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3563 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
3564 {
3565   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3566   unsigned short *aj=(unsigned short *)a->j;
3567   PetscErrorCode ierr;
3568   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3569   MatScalar      *aa=a->a;
3570   PetscScalar    *x,*b;
3571 
3572   PetscFunctionBegin;
3573   SSE_SCOPE_BEGIN;
3574   /*
3575      Note: This code currently uses demotion of double
3576      to float when performing the mixed-mode computation.
3577      This may not be numerically reasonable for all applications.
3578   */
3579   PREFETCH_NTA(aa+16*ai[1]);
3580 
3581   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3583   {
3584     /* x will first be computed in single precision then promoted inplace to double */
3585     MatScalar      *v,*t=(MatScalar *)x;
3586     int            nz,i,idt,ai16;
3587     unsigned int   jdx,idx;
3588     unsigned short *vi;
3589     /* Forward solve the lower triangular factor. */
3590 
3591     /* First block is the identity. */
3592     idx  = 0;
3593     CONVERT_DOUBLE4_FLOAT4(t,b);
3594     v    =  aa + 16*((unsigned int)ai[1]);
3595 
3596     for (i=1; i<n;) {
3597       PREFETCH_NTA(&v[8]);
3598       vi   =  aj      + ai[i];
3599       nz   =  diag[i] - ai[i];
3600       idx +=  4;
3601 
3602       /* Demote RHS from double to float. */
3603       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3604       LOAD_PS(&t[idx],XMM7);
3605 
3606       while (nz--) {
3607         PREFETCH_NTA(&v[16]);
3608         jdx = 4*((unsigned int)(*vi++));
3609 
3610         /* 4x4 Matrix-Vector product with negative accumulation: */
3611         SSE_INLINE_BEGIN_2(&t[jdx],v)
3612           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3613 
3614           /* First Column */
3615           SSE_COPY_PS(XMM0,XMM6)
3616           SSE_SHUFFLE(XMM0,XMM0,0x00)
3617           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3618           SSE_SUB_PS(XMM7,XMM0)
3619 
3620           /* Second Column */
3621           SSE_COPY_PS(XMM1,XMM6)
3622           SSE_SHUFFLE(XMM1,XMM1,0x55)
3623           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3624           SSE_SUB_PS(XMM7,XMM1)
3625 
3626           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3627 
3628           /* Third Column */
3629           SSE_COPY_PS(XMM2,XMM6)
3630           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3631           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3632           SSE_SUB_PS(XMM7,XMM2)
3633 
3634           /* Fourth Column */
3635           SSE_COPY_PS(XMM3,XMM6)
3636           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3637           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3638           SSE_SUB_PS(XMM7,XMM3)
3639         SSE_INLINE_END_2
3640 
3641         v  += 16;
3642       }
3643       v    =  aa + 16*ai[++i];
3644       PREFETCH_NTA(v);
3645       STORE_PS(&t[idx],XMM7);
3646     }
3647 
3648     /* Backward solve the upper triangular factor.*/
3649 
3650     idt  = 4*(n-1);
3651     ai16 = 16*diag[n-1];
3652     v    = aa + ai16 + 16;
3653     for (i=n-1; i>=0;){
3654       PREFETCH_NTA(&v[8]);
3655       vi = aj + diag[i] + 1;
3656       nz = ai[i+1] - diag[i] - 1;
3657 
3658       LOAD_PS(&t[idt],XMM7);
3659 
3660       while (nz--) {
3661         PREFETCH_NTA(&v[16]);
3662         idx = 4*((unsigned int)(*vi++));
3663 
3664         /* 4x4 Matrix-Vector Product with negative accumulation: */
3665         SSE_INLINE_BEGIN_2(&t[idx],v)
3666           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3667 
3668           /* First Column */
3669           SSE_COPY_PS(XMM0,XMM6)
3670           SSE_SHUFFLE(XMM0,XMM0,0x00)
3671           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3672           SSE_SUB_PS(XMM7,XMM0)
3673 
3674           /* Second Column */
3675           SSE_COPY_PS(XMM1,XMM6)
3676           SSE_SHUFFLE(XMM1,XMM1,0x55)
3677           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3678           SSE_SUB_PS(XMM7,XMM1)
3679 
3680           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3681 
3682           /* Third Column */
3683           SSE_COPY_PS(XMM2,XMM6)
3684           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3685           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3686           SSE_SUB_PS(XMM7,XMM2)
3687 
3688           /* Fourth Column */
3689           SSE_COPY_PS(XMM3,XMM6)
3690           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3691           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3692           SSE_SUB_PS(XMM7,XMM3)
3693         SSE_INLINE_END_2
3694         v  += 16;
3695       }
3696       v    = aa + ai16;
3697       ai16 = 16*diag[--i];
3698       PREFETCH_NTA(aa+ai16+16);
3699       /*
3700          Scale the result by the diagonal 4x4 block,
3701          which was inverted as part of the factorization
3702       */
3703       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3704         /* First Column */
3705         SSE_COPY_PS(XMM0,XMM7)
3706         SSE_SHUFFLE(XMM0,XMM0,0x00)
3707         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3708 
3709         /* Second Column */
3710         SSE_COPY_PS(XMM1,XMM7)
3711         SSE_SHUFFLE(XMM1,XMM1,0x55)
3712         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3713         SSE_ADD_PS(XMM0,XMM1)
3714 
3715         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3716 
3717         /* Third Column */
3718         SSE_COPY_PS(XMM2,XMM7)
3719         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3720         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3721         SSE_ADD_PS(XMM0,XMM2)
3722 
3723         /* Fourth Column */
3724         SSE_COPY_PS(XMM3,XMM7)
3725         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3726         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3727         SSE_ADD_PS(XMM0,XMM3)
3728 
3729         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3730       SSE_INLINE_END_3
3731 
3732       v    = aa + ai16 + 16;
3733       idt -= 4;
3734     }
3735 
3736     /* Convert t from single precision back to double precision (inplace)*/
3737     idt = 4*(n-1);
3738     for (i=n-1;i>=0;i--) {
3739       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3740       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3741       PetscScalar *xtemp=&x[idt];
3742       MatScalar   *ttemp=&t[idt];
3743       xtemp[3] = (PetscScalar)ttemp[3];
3744       xtemp[2] = (PetscScalar)ttemp[2];
3745       xtemp[1] = (PetscScalar)ttemp[1];
3746       xtemp[0] = (PetscScalar)ttemp[0];
3747       idt -= 4;
3748     }
3749 
3750   } /* End of artificial scope. */
3751   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3752   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3753   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3754   SSE_SCOPE_END;
3755   PetscFunctionReturn(0);
3756 }
3757 
3758 #undef __FUNCT__
3759 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3760 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
3761 {
3762   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3763   int            *aj=a->j;
3764   PetscErrorCode ierr;
3765   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3766   MatScalar      *aa=a->a;
3767   PetscScalar    *x,*b;
3768 
3769   PetscFunctionBegin;
3770   SSE_SCOPE_BEGIN;
3771   /*
3772      Note: This code currently uses demotion of double
3773      to float when performing the mixed-mode computation.
3774      This may not be numerically reasonable for all applications.
3775   */
3776   PREFETCH_NTA(aa+16*ai[1]);
3777 
3778   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3779   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3780   {
3781     /* x will first be computed in single precision then promoted inplace to double */
3782     MatScalar *v,*t=(MatScalar *)x;
3783     int       nz,i,idt,ai16;
3784     int       jdx,idx;
3785     int       *vi;
3786     /* Forward solve the lower triangular factor. */
3787 
3788     /* First block is the identity. */
3789     idx  = 0;
3790     CONVERT_DOUBLE4_FLOAT4(t,b);
3791     v    =  aa + 16*ai[1];
3792 
3793     for (i=1; i<n;) {
3794       PREFETCH_NTA(&v[8]);
3795       vi   =  aj      + ai[i];
3796       nz   =  diag[i] - ai[i];
3797       idx +=  4;
3798 
3799       /* Demote RHS from double to float. */
3800       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3801       LOAD_PS(&t[idx],XMM7);
3802 
3803       while (nz--) {
3804         PREFETCH_NTA(&v[16]);
3805         jdx = 4*(*vi++);
3806 /*          jdx = *vi++; */
3807 
3808         /* 4x4 Matrix-Vector product with negative accumulation: */
3809         SSE_INLINE_BEGIN_2(&t[jdx],v)
3810           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3811 
3812           /* First Column */
3813           SSE_COPY_PS(XMM0,XMM6)
3814           SSE_SHUFFLE(XMM0,XMM0,0x00)
3815           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3816           SSE_SUB_PS(XMM7,XMM0)
3817 
3818           /* Second Column */
3819           SSE_COPY_PS(XMM1,XMM6)
3820           SSE_SHUFFLE(XMM1,XMM1,0x55)
3821           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3822           SSE_SUB_PS(XMM7,XMM1)
3823 
3824           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3825 
3826           /* Third Column */
3827           SSE_COPY_PS(XMM2,XMM6)
3828           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3829           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3830           SSE_SUB_PS(XMM7,XMM2)
3831 
3832           /* Fourth Column */
3833           SSE_COPY_PS(XMM3,XMM6)
3834           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3835           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3836           SSE_SUB_PS(XMM7,XMM3)
3837         SSE_INLINE_END_2
3838 
3839         v  += 16;
3840       }
3841       v    =  aa + 16*ai[++i];
3842       PREFETCH_NTA(v);
3843       STORE_PS(&t[idx],XMM7);
3844     }
3845 
3846     /* Backward solve the upper triangular factor.*/
3847 
3848     idt  = 4*(n-1);
3849     ai16 = 16*diag[n-1];
3850     v    = aa + ai16 + 16;
3851     for (i=n-1; i>=0;){
3852       PREFETCH_NTA(&v[8]);
3853       vi = aj + diag[i] + 1;
3854       nz = ai[i+1] - diag[i] - 1;
3855 
3856       LOAD_PS(&t[idt],XMM7);
3857 
3858       while (nz--) {
3859         PREFETCH_NTA(&v[16]);
3860         idx = 4*(*vi++);
3861 /*          idx = *vi++; */
3862 
3863         /* 4x4 Matrix-Vector Product with negative accumulation: */
3864         SSE_INLINE_BEGIN_2(&t[idx],v)
3865           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3866 
3867           /* First Column */
3868           SSE_COPY_PS(XMM0,XMM6)
3869           SSE_SHUFFLE(XMM0,XMM0,0x00)
3870           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3871           SSE_SUB_PS(XMM7,XMM0)
3872 
3873           /* Second Column */
3874           SSE_COPY_PS(XMM1,XMM6)
3875           SSE_SHUFFLE(XMM1,XMM1,0x55)
3876           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3877           SSE_SUB_PS(XMM7,XMM1)
3878 
3879           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3880 
3881           /* Third Column */
3882           SSE_COPY_PS(XMM2,XMM6)
3883           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3884           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3885           SSE_SUB_PS(XMM7,XMM2)
3886 
3887           /* Fourth Column */
3888           SSE_COPY_PS(XMM3,XMM6)
3889           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3890           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3891           SSE_SUB_PS(XMM7,XMM3)
3892         SSE_INLINE_END_2
3893         v  += 16;
3894       }
3895       v    = aa + ai16;
3896       ai16 = 16*diag[--i];
3897       PREFETCH_NTA(aa+ai16+16);
3898       /*
3899          Scale the result by the diagonal 4x4 block,
3900          which was inverted as part of the factorization
3901       */
3902       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3903         /* First Column */
3904         SSE_COPY_PS(XMM0,XMM7)
3905         SSE_SHUFFLE(XMM0,XMM0,0x00)
3906         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3907 
3908         /* Second Column */
3909         SSE_COPY_PS(XMM1,XMM7)
3910         SSE_SHUFFLE(XMM1,XMM1,0x55)
3911         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3912         SSE_ADD_PS(XMM0,XMM1)
3913 
3914         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3915 
3916         /* Third Column */
3917         SSE_COPY_PS(XMM2,XMM7)
3918         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3919         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3920         SSE_ADD_PS(XMM0,XMM2)
3921 
3922         /* Fourth Column */
3923         SSE_COPY_PS(XMM3,XMM7)
3924         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3925         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3926         SSE_ADD_PS(XMM0,XMM3)
3927 
3928         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3929       SSE_INLINE_END_3
3930 
3931       v    = aa + ai16 + 16;
3932       idt -= 4;
3933     }
3934 
3935     /* Convert t from single precision back to double precision (inplace)*/
3936     idt = 4*(n-1);
3937     for (i=n-1;i>=0;i--) {
3938       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3939       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3940       PetscScalar *xtemp=&x[idt];
3941       MatScalar   *ttemp=&t[idt];
3942       xtemp[3] = (PetscScalar)ttemp[3];
3943       xtemp[2] = (PetscScalar)ttemp[2];
3944       xtemp[1] = (PetscScalar)ttemp[1];
3945       xtemp[0] = (PetscScalar)ttemp[0];
3946       idt -= 4;
3947     }
3948 
3949   } /* End of artificial scope. */
3950   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3951   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3952   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3953   SSE_SCOPE_END;
3954   PetscFunctionReturn(0);
3955 }
3956 
3957 #endif
3958 
3959 #undef __FUNCT__
3960 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3961 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
3962 {
3963   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3964   IS                iscol=a->col,isrow=a->row;
3965   PetscErrorCode    ierr;
3966   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3967   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3968   const MatScalar   *aa=a->a,*v;
3969   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3970   const PetscScalar *b;
3971 
3972   PetscFunctionBegin;
3973   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3974   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3975   t  = a->solve_work;
3976 
3977   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3978   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3979 
3980   /* forward solve the lower triangular */
3981   idx    = 3*(*r++);
3982   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3983   for (i=1; i<n; i++) {
3984     v     = aa + 9*ai[i];
3985     vi    = aj + ai[i];
3986     nz    = diag[i] - ai[i];
3987     idx   = 3*(*r++);
3988     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3989     while (nz--) {
3990       idx   = 3*(*vi++);
3991       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3992       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3993       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3994       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3995       v += 9;
3996     }
3997     idx = 3*i;
3998     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3999   }
4000   /* backward solve the upper triangular */
4001   for (i=n-1; i>=0; i--){
4002     v    = aa + 9*diag[i] + 9;
4003     vi   = aj + diag[i] + 1;
4004     nz   = ai[i+1] - diag[i] - 1;
4005     idt  = 3*i;
4006     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4007     while (nz--) {
4008       idx   = 3*(*vi++);
4009       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4010       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4011       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4012       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4013       v += 9;
4014     }
4015     idc = 3*(*c--);
4016     v   = aa + 9*diag[i];
4017     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4018     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4019     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4020   }
4021   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4022   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4023   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4024   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4025   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4026   PetscFunctionReturn(0);
4027 }
4028 
4029 #undef __FUNCT__
4030 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
4031 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
4032 {
4033   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4034   IS                iscol=a->col,isrow=a->row;
4035   PetscErrorCode    ierr;
4036   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
4037   const PetscInt    *r,*c,*rout,*cout;
4038   const MatScalar   *aa=a->a,*v;
4039   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4040   const PetscScalar *b;
4041 
4042   PetscFunctionBegin;
4043   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4044   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4045   t  = a->solve_work;
4046 
4047   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4048   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4049 
4050   /* forward solve the lower triangular */
4051   idx    = 3*r[0];
4052   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4053   for (i=1; i<n; i++) {
4054     v     = aa + 9*ai[i];
4055     vi    = aj + ai[i];
4056     nz    = ai[i+1] - ai[i];
4057     idx   = 3*r[i];
4058     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4059     for(m=0;m<nz;m++){
4060       idx   = 3*vi[m];
4061       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4062       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4063       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4064       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4065       v += 9;
4066     }
4067     idx = 3*i;
4068     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4069   }
4070   /* backward solve the upper triangular */
4071   for (i=n-1; i>=0; i--){
4072     k    = 2*n-i;
4073     v    = aa + 9*ai[k];
4074     vi   = aj + ai[k];
4075     nz   = ai[k +1] - ai[k] - 1;
4076     idt  = 3*i;
4077     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4078     for(m=0;m<nz;m++){
4079       idx   = 3*vi[m];
4080       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4081       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4082       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4083       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4084       v += 9;
4085     }
4086     idc = 3*c[i];
4087     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4088     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4089     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4090   }
4091   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4092   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4093   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4094   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4095   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4096   PetscFunctionReturn(0);
4097 }
4098 
4099 #undef __FUNCT__
4100 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2"
4101 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4102 {
4103   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4104   IS                iscol=a->col,isrow=a->row;
4105   PetscErrorCode    ierr;
4106   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4107   const PetscInt    *r,*c,*rout,*cout;
4108   const MatScalar   *aa=a->a,*v;
4109   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4110   const PetscScalar *b;
4111 
4112   PetscFunctionBegin;
4113   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4114   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4115   t  = a->solve_work;
4116 
4117   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4118   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4119 
4120   /* forward solve the lower triangular */
4121   idx    = 3*r[0];
4122   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4123   for (i=1; i<n; i++) {
4124     v     = aa + 9*ai[i];
4125     vi    = aj + ai[i];
4126     nz    = ai[i+1] - ai[i];
4127     idx   = 3*r[i];
4128     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4129     for(m=0;m<nz;m++){
4130       idx   = 3*vi[m];
4131       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4132       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4133       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4134       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4135       v += 9;
4136     }
4137     idx = 3*i;
4138     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4139   }
4140   /* backward solve the upper triangular */
4141   for (i=n-1; i>=0; i--){
4142     v    = aa + 9*(adiag[i+1]+1);
4143     vi   = aj + adiag[i+1]+1;
4144     nz   = adiag[i] - adiag[i+1] - 1;
4145     idt  = 3*i;
4146     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4147     for(m=0;m<nz;m++){
4148       idx   = 3*vi[m];
4149       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4150       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4151       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4152       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4153       v += 9;
4154     }
4155     idc = 3*c[i];
4156     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4157     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4158     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4159   }
4160   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4161   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4162   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4163   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4164   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4165   PetscFunctionReturn(0);
4166 }
4167 
4168 /*
4169       Special case where the matrix was ILU(0) factored in the natural
4170    ordering. This eliminates the need for the column and row permutation.
4171 */
4172 #undef __FUNCT__
4173 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4174 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4175 {
4176   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4177   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4178   PetscErrorCode    ierr;
4179   PetscInt          *diag = a->diag;
4180   const MatScalar   *aa=a->a,*v;
4181   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4182   const PetscScalar *b;
4183   PetscInt          jdx,idt,idx,nz,*vi,i;
4184 
4185   PetscFunctionBegin;
4186   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4187   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4188 
4189   /* forward solve the lower triangular */
4190   idx    = 0;
4191   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4192   for (i=1; i<n; i++) {
4193     v     =  aa      + 9*ai[i];
4194     vi    =  aj      + ai[i];
4195     nz    =  diag[i] - ai[i];
4196     idx   +=  3;
4197     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4198     while (nz--) {
4199       jdx   = 3*(*vi++);
4200       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4201       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4202       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4203       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4204       v    += 9;
4205     }
4206     x[idx]   = s1;
4207     x[1+idx] = s2;
4208     x[2+idx] = s3;
4209   }
4210   /* backward solve the upper triangular */
4211   for (i=n-1; i>=0; i--){
4212     v    = aa + 9*diag[i] + 9;
4213     vi   = aj + diag[i] + 1;
4214     nz   = ai[i+1] - diag[i] - 1;
4215     idt  = 3*i;
4216     s1 = x[idt];  s2 = x[1+idt];
4217     s3 = x[2+idt];
4218     while (nz--) {
4219       idx   = 3*(*vi++);
4220       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4221       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4222       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4223       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4224       v    += 9;
4225     }
4226     v        = aa +  9*diag[i];
4227     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4228     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4229     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4230   }
4231 
4232   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4234   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4235   PetscFunctionReturn(0);
4236 }
4237 
4238 #undef __FUNCT__
4239 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4240 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4241 {
4242     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4243     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4244     PetscErrorCode    ierr;
4245     PetscInt          idx,jdx,idt;
4246     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4247     const MatScalar   *aa=a->a,*v;
4248     PetscScalar       *x;
4249     const PetscScalar *b;
4250     PetscScalar        s1,s2,s3,x1,x2,x3;
4251 
4252     PetscFunctionBegin;
4253     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4254     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4255     /* forward solve the lower triangular */
4256     idx    = 0;
4257     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4258     for (i=1; i<n; i++) {
4259        v    = aa + bs2*ai[i];
4260        vi   = aj + ai[i];
4261        nz   = ai[i+1] - ai[i];
4262       idx   = bs*i;
4263        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4264       for(k=0;k<nz;k++){
4265          jdx   = bs*vi[k];
4266           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4267           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4268           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4269           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4270 
4271           v   +=  bs2;
4272         }
4273 
4274        x[idx]   = s1;
4275        x[1+idx] = s2;
4276        x[2+idx] = s3;
4277     }
4278 
4279    /* backward solve the upper triangular */
4280   for (i=n-1; i>=0; i--){
4281      v   = aa + bs2*ai[2*n-i];
4282      vi  = aj + ai[2*n-i];
4283      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4284      idt = bs*i;
4285      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4286 
4287      for(k=0;k<nz;k++){
4288        idx   = bs*vi[k];
4289        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4290        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4291        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4292        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4293 
4294         v   +=  bs2;
4295     }
4296     /* x = inv_diagonal*x */
4297    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4298    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4299    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4300 
4301   }
4302 
4303   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4304   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4305   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4306   PetscFunctionReturn(0);
4307 }
4308 
4309 #undef __FUNCT__
4310 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
4311 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4312 {
4313     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4314     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4315     PetscErrorCode    ierr;
4316     PetscInt          idx,jdx,idt;
4317     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4318     const MatScalar   *aa=a->a,*v;
4319     PetscScalar       *x;
4320     const PetscScalar *b;
4321     PetscScalar        s1,s2,s3,x1,x2,x3;
4322 
4323     PetscFunctionBegin;
4324     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4325     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4326     /* forward solve the lower triangular */
4327     idx    = 0;
4328     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4329     for (i=1; i<n; i++) {
4330        v    = aa + bs2*ai[i];
4331        vi   = aj + ai[i];
4332        nz   = ai[i+1] - ai[i];
4333       idx   = bs*i;
4334        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4335       for(k=0;k<nz;k++){
4336          jdx   = bs*vi[k];
4337           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4338           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4339           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4340           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4341 
4342           v   +=  bs2;
4343         }
4344 
4345        x[idx]   = s1;
4346        x[1+idx] = s2;
4347        x[2+idx] = s3;
4348     }
4349 
4350    /* backward solve the upper triangular */
4351   for (i=n-1; i>=0; i--){
4352     v   = aa + bs2*(adiag[i+1]+1);
4353      vi  = aj + adiag[i+1]+1;
4354      nz  = adiag[i] - adiag[i+1]-1;
4355      idt = bs*i;
4356      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4357 
4358      for(k=0;k<nz;k++){
4359        idx   = bs*vi[k];
4360        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4361        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4362        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4363        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4364 
4365         v   +=  bs2;
4366     }
4367     /* x = inv_diagonal*x */
4368    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4369    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4370    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4371 
4372   }
4373 
4374   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4375   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4376   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4377   PetscFunctionReturn(0);
4378 }
4379 
4380 #undef __FUNCT__
4381 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4382 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4383 {
4384   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4385   IS                iscol=a->col,isrow=a->row;
4386   PetscErrorCode    ierr;
4387   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4388   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4389   const MatScalar   *aa=a->a,*v;
4390   PetscScalar       *x,s1,s2,x1,x2,*t;
4391   const PetscScalar *b;
4392 
4393   PetscFunctionBegin;
4394   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4395   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4396   t  = a->solve_work;
4397 
4398   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4399   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4400 
4401   /* forward solve the lower triangular */
4402   idx    = 2*(*r++);
4403   t[0] = b[idx]; t[1] = b[1+idx];
4404   for (i=1; i<n; i++) {
4405     v     = aa + 4*ai[i];
4406     vi    = aj + ai[i];
4407     nz    = diag[i] - ai[i];
4408     idx   = 2*(*r++);
4409     s1  = b[idx]; s2 = b[1+idx];
4410     while (nz--) {
4411       idx   = 2*(*vi++);
4412       x1    = t[idx]; x2 = t[1+idx];
4413       s1 -= v[0]*x1 + v[2]*x2;
4414       s2 -= v[1]*x1 + v[3]*x2;
4415       v += 4;
4416     }
4417     idx = 2*i;
4418     t[idx] = s1; t[1+idx] = s2;
4419   }
4420   /* backward solve the upper triangular */
4421   for (i=n-1; i>=0; i--){
4422     v    = aa + 4*diag[i] + 4;
4423     vi   = aj + diag[i] + 1;
4424     nz   = ai[i+1] - diag[i] - 1;
4425     idt  = 2*i;
4426     s1 = t[idt]; s2 = t[1+idt];
4427     while (nz--) {
4428       idx   = 2*(*vi++);
4429       x1    = t[idx]; x2 = t[1+idx];
4430       s1 -= v[0]*x1 + v[2]*x2;
4431       s2 -= v[1]*x1 + v[3]*x2;
4432       v += 4;
4433     }
4434     idc = 2*(*c--);
4435     v   = aa + 4*diag[i];
4436     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4437     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4438   }
4439   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4440   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4441   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4443   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4444   PetscFunctionReturn(0);
4445 }
4446 
4447 #undef __FUNCT__
4448 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4449 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
4450 {
4451   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4452   IS                iscol=a->col,isrow=a->row;
4453   PetscErrorCode    ierr;
4454   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
4455   const PetscInt    *r,*c,*rout,*cout;
4456   const MatScalar   *aa=a->a,*v;
4457   PetscScalar       *x,s1,s2,x1,x2,*t;
4458   const PetscScalar *b;
4459 
4460   PetscFunctionBegin;
4461   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4462   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4463   t  = a->solve_work;
4464 
4465   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4466   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4467 
4468   /* forward solve the lower triangular */
4469   idx    = 2*r[0];
4470   t[0] = b[idx]; t[1] = b[1+idx];
4471   for (i=1; i<n; i++) {
4472     v     = aa + 4*ai[i];
4473     vi    = aj + ai[i];
4474     nz    = ai[i+1] - ai[i];
4475     idx   = 2*r[i];
4476     s1  = b[idx]; s2 = b[1+idx];
4477     for(m=0;m<nz;m++){
4478       jdx   = 2*vi[m];
4479       x1    = t[jdx]; x2 = t[1+jdx];
4480       s1 -= v[0]*x1 + v[2]*x2;
4481       s2 -= v[1]*x1 + v[3]*x2;
4482       v += 4;
4483     }
4484     idx = 2*i;
4485     t[idx] = s1; t[1+idx] = s2;
4486   }
4487   /* backward solve the upper triangular */
4488   for (i=n-1; i>=0; i--){
4489     k = 2*n-i;
4490     v    = aa + 4*ai[k];
4491     vi   = aj + ai[k];
4492     nz   = ai[k +1] - ai[k] - 1;
4493     idt  = 2*i;
4494     s1 = t[idt]; s2 = t[1+idt];
4495     for(m=0;m<nz;m++){
4496       idx   = 2*vi[m];
4497       x1    = t[idx]; x2 = t[1+idx];
4498       s1 -= v[0]*x1 + v[2]*x2;
4499       s2 -= v[1]*x1 + v[3]*x2;
4500       v += 4;
4501     }
4502     idc = 2*c[i];
4503     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4504     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4505   }
4506   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4507   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4508   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4509   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4510   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4511   PetscFunctionReturn(0);
4512 }
4513 
4514 #undef __FUNCT__
4515 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
4516 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4517 {
4518   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4519   IS                iscol=a->col,isrow=a->row;
4520   PetscErrorCode    ierr;
4521   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4522   const PetscInt    *r,*c,*rout,*cout;
4523   const MatScalar   *aa=a->a,*v;
4524   PetscScalar       *x,s1,s2,x1,x2,*t;
4525   const PetscScalar *b;
4526 
4527   PetscFunctionBegin;
4528   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4529   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4530   t  = a->solve_work;
4531 
4532   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4533   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4534 
4535   /* forward solve the lower triangular */
4536   idx    = 2*r[0];
4537   t[0] = b[idx]; t[1] = b[1+idx];
4538   for (i=1; i<n; i++) {
4539     v     = aa + 4*ai[i];
4540     vi    = aj + ai[i];
4541     nz    = ai[i+1] - ai[i];
4542     idx   = 2*r[i];
4543     s1  = b[idx]; s2 = b[1+idx];
4544     for(m=0;m<nz;m++){
4545       jdx   = 2*vi[m];
4546       x1    = t[jdx]; x2 = t[1+jdx];
4547       s1 -= v[0]*x1 + v[2]*x2;
4548       s2 -= v[1]*x1 + v[3]*x2;
4549       v += 4;
4550     }
4551     idx = 2*i;
4552     t[idx] = s1; t[1+idx] = s2;
4553   }
4554   /* backward solve the upper triangular */
4555   for (i=n-1; i>=0; i--){
4556     v    = aa + 4*(adiag[i+1]+1);
4557     vi   = aj + adiag[i+1]+1;
4558     nz   = adiag[i] - adiag[i+1] - 1;
4559     idt  = 2*i;
4560     s1 = t[idt]; s2 = t[1+idt];
4561     for(m=0;m<nz;m++){
4562       idx   = 2*vi[m];
4563       x1    = t[idx]; x2 = t[1+idx];
4564       s1 -= v[0]*x1 + v[2]*x2;
4565       s2 -= v[1]*x1 + v[3]*x2;
4566       v += 4;
4567     }
4568     idc = 2*c[i];
4569     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4570     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4571   }
4572   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4573   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4574   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4575   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4576   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4577   PetscFunctionReturn(0);
4578 }
4579 
4580 /*
4581       Special case where the matrix was ILU(0) factored in the natural
4582    ordering. This eliminates the need for the column and row permutation.
4583 */
4584 #undef __FUNCT__
4585 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4586 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
4587 {
4588   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4589   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4590   PetscErrorCode    ierr;
4591   PetscInt          *diag = a->diag;
4592   const MatScalar   *aa=a->a,*v;
4593   PetscScalar       *x,s1,s2,x1,x2;
4594   const PetscScalar *b;
4595   PetscInt          jdx,idt,idx,nz,*vi,i;
4596 
4597   PetscFunctionBegin;
4598   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4600 
4601   /* forward solve the lower triangular */
4602   idx    = 0;
4603   x[0]   = b[0]; x[1] = b[1];
4604   for (i=1; i<n; i++) {
4605     v     =  aa      + 4*ai[i];
4606     vi    =  aj      + ai[i];
4607     nz    =  diag[i] - ai[i];
4608     idx   +=  2;
4609     s1  =  b[idx];s2 = b[1+idx];
4610     while (nz--) {
4611       jdx   = 2*(*vi++);
4612       x1    = x[jdx];x2 = x[1+jdx];
4613       s1 -= v[0]*x1 + v[2]*x2;
4614       s2 -= v[1]*x1 + v[3]*x2;
4615       v    += 4;
4616     }
4617     x[idx]   = s1;
4618     x[1+idx] = s2;
4619   }
4620   /* backward solve the upper triangular */
4621   for (i=n-1; i>=0; i--){
4622     v    = aa + 4*diag[i] + 4;
4623     vi   = aj + diag[i] + 1;
4624     nz   = ai[i+1] - diag[i] - 1;
4625     idt  = 2*i;
4626     s1 = x[idt];  s2 = x[1+idt];
4627     while (nz--) {
4628       idx   = 2*(*vi++);
4629       x1    = x[idx];   x2 = x[1+idx];
4630       s1 -= v[0]*x1 + v[2]*x2;
4631       s2 -= v[1]*x1 + v[3]*x2;
4632       v    += 4;
4633     }
4634     v        = aa +  4*diag[i];
4635     x[idt]   = v[0]*s1 + v[2]*s2;
4636     x[1+idt] = v[1]*s1 + v[3]*s2;
4637   }
4638 
4639   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4640   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4641   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4642   PetscFunctionReturn(0);
4643 }
4644 
4645 #undef __FUNCT__
4646 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4647 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4648 {
4649     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4650     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4651     PetscErrorCode    ierr;
4652     PetscInt          jdx;
4653     const MatScalar   *aa=a->a,*v;
4654     PetscScalar       *x,s1,s2,x1,x2;
4655     const PetscScalar *b;
4656 
4657     PetscFunctionBegin;
4658     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4659     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4660     /* forward solve the lower triangular */
4661     idx    = 0;
4662     x[0] = b[idx]; x[1] = b[1+idx];
4663     for (i=1; i<n; i++) {
4664         v   = aa + 4*ai[i];
4665        vi   = aj + ai[i];
4666        nz   = ai[i+1] - ai[i];
4667        idx  = 2*i;
4668        s1   = b[idx];s2 = b[1+idx];
4669       for(k=0;k<nz;k++){
4670          jdx   = 2*vi[k];
4671           x1    = x[jdx];x2 = x[1+jdx];
4672           s1   -= v[0]*x1 + v[2]*x2;
4673           s2   -= v[1]*x1 + v[3]*x2;
4674            v   +=  4;
4675         }
4676        x[idx]   = s1;
4677        x[1+idx] = s2;
4678     }
4679 
4680    /* backward solve the upper triangular */
4681   for (i=n-1; i>=0; i--){
4682      v   = aa + 4*ai[2*n-i];
4683      vi  = aj + ai[2*n-i];
4684      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4685      idt = 2*i;
4686      s1 = x[idt];  s2 = x[1+idt];
4687      for(k=0;k<nz;k++){
4688       idx   = 2*vi[k];
4689        x1    = x[idx];   x2 = x[1+idx];
4690        s1 -= v[0]*x1 + v[2]*x2;
4691        s2 -= v[1]*x1 + v[3]*x2;
4692          v    += 4;
4693     }
4694     /* x = inv_diagonal*x */
4695    x[idt]   = v[0]*s1 + v[2]*s2;
4696    x[1+idt] = v[1]*s1 + v[3]*s2;
4697   }
4698 
4699   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4700   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4701   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4702   PetscFunctionReturn(0);
4703 }
4704 
4705 #undef __FUNCT__
4706 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4707 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4708 {
4709     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4710     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4711     PetscErrorCode    ierr;
4712     PetscInt          jdx;
4713     const MatScalar   *aa=a->a,*v;
4714     PetscScalar       *x,s1,s2,x1,x2;
4715     const PetscScalar *b;
4716 
4717     PetscFunctionBegin;
4718     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4719     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4720     /* forward solve the lower triangular */
4721     idx    = 0;
4722     x[0] = b[idx]; x[1] = b[1+idx];
4723     for (i=1; i<n; i++) {
4724         v   = aa + 4*ai[i];
4725        vi   = aj + ai[i];
4726        nz   = ai[i+1] - ai[i];
4727        idx  = 2*i;
4728        s1   = b[idx];s2 = b[1+idx];
4729       for(k=0;k<nz;k++){
4730          jdx   = 2*vi[k];
4731           x1    = x[jdx];x2 = x[1+jdx];
4732           s1   -= v[0]*x1 + v[2]*x2;
4733           s2   -= v[1]*x1 + v[3]*x2;
4734            v   +=  4;
4735         }
4736        x[idx]   = s1;
4737        x[1+idx] = s2;
4738     }
4739 
4740    /* backward solve the upper triangular */
4741   for (i=n-1; i>=0; i--){
4742      v   = aa + 4*(adiag[i+1]+1);
4743      vi  = aj + adiag[i+1]+1;
4744      nz  = adiag[i] - adiag[i+1]-1;
4745      idt = 2*i;
4746      s1 = x[idt];  s2 = x[1+idt];
4747      for(k=0;k<nz;k++){
4748       idx   = 2*vi[k];
4749        x1    = x[idx];   x2 = x[1+idx];
4750        s1 -= v[0]*x1 + v[2]*x2;
4751        s2 -= v[1]*x1 + v[3]*x2;
4752          v    += 4;
4753     }
4754     /* x = inv_diagonal*x */
4755    x[idt]   = v[0]*s1 + v[2]*s2;
4756    x[1+idt] = v[1]*s1 + v[3]*s2;
4757   }
4758 
4759   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4760   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4761   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4762   PetscFunctionReturn(0);
4763 }
4764 
4765 #undef __FUNCT__
4766 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4767 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4768 {
4769   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4770   IS             iscol=a->col,isrow=a->row;
4771   PetscErrorCode ierr;
4772   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4773   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4774   MatScalar      *aa=a->a,*v;
4775   PetscScalar    *x,*b,s1,*t;
4776 
4777   PetscFunctionBegin;
4778   if (!n) PetscFunctionReturn(0);
4779 
4780   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4781   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4782   t  = a->solve_work;
4783 
4784   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4785   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4786 
4787   /* forward solve the lower triangular */
4788   t[0] = b[*r++];
4789   for (i=1; i<n; i++) {
4790     v     = aa + ai[i];
4791     vi    = aj + ai[i];
4792     nz    = diag[i] - ai[i];
4793     s1  = b[*r++];
4794     while (nz--) {
4795       s1 -= (*v++)*t[*vi++];
4796     }
4797     t[i] = s1;
4798   }
4799   /* backward solve the upper triangular */
4800   for (i=n-1; i>=0; i--){
4801     v    = aa + diag[i] + 1;
4802     vi   = aj + diag[i] + 1;
4803     nz   = ai[i+1] - diag[i] - 1;
4804     s1 = t[i];
4805     while (nz--) {
4806       s1 -= (*v++)*t[*vi++];
4807     }
4808     x[*c--] = t[i] = aa[diag[i]]*s1;
4809   }
4810 
4811   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4812   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4813   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4814   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4815   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4816   PetscFunctionReturn(0);
4817 }
4818 /*
4819       Special case where the matrix was ILU(0) factored in the natural
4820    ordering. This eliminates the need for the column and row permutation.
4821 */
4822 #undef __FUNCT__
4823 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4824 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
4825 {
4826   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4827   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4828   PetscErrorCode ierr;
4829   PetscInt       *diag = a->diag;
4830   MatScalar      *aa=a->a;
4831   PetscScalar    *x,*b;
4832   PetscScalar    s1,x1;
4833   MatScalar      *v;
4834   PetscInt       jdx,idt,idx,nz,*vi,i;
4835 
4836   PetscFunctionBegin;
4837   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4838   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4839 
4840   /* forward solve the lower triangular */
4841   idx    = 0;
4842   x[0]   = b[0];
4843   for (i=1; i<n; i++) {
4844     v     =  aa      + ai[i];
4845     vi    =  aj      + ai[i];
4846     nz    =  diag[i] - ai[i];
4847     idx   +=  1;
4848     s1  =  b[idx];
4849     while (nz--) {
4850       jdx   = *vi++;
4851       x1    = x[jdx];
4852       s1 -= v[0]*x1;
4853       v    += 1;
4854     }
4855     x[idx]   = s1;
4856   }
4857   /* backward solve the upper triangular */
4858   for (i=n-1; i>=0; i--){
4859     v    = aa + diag[i] + 1;
4860     vi   = aj + diag[i] + 1;
4861     nz   = ai[i+1] - diag[i] - 1;
4862     idt  = i;
4863     s1 = x[idt];
4864     while (nz--) {
4865       idx   = *vi++;
4866       x1    = x[idx];
4867       s1 -= v[0]*x1;
4868       v    += 1;
4869     }
4870     v        = aa +  diag[i];
4871     x[idt]   = v[0]*s1;
4872   }
4873   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4874   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4875   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4876   PetscFunctionReturn(0);
4877 }
4878 
4879 /* ----------------------------------------------------------------*/
4880 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
4881 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
4882 
4883 #undef __FUNCT__
4884 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
4885 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
4886 {
4887   Mat            C=B;
4888   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
4889   IS             isrow = b->row,isicol = b->icol;
4890   PetscErrorCode ierr;
4891   const PetscInt *r,*ic,*ics;
4892   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
4893   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4894   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4895   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4896   MatScalar      *v_work;
4897 
4898   PetscFunctionBegin;
4899   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4900   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4901   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4902   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
4903   ics  = ic;
4904 
4905   /* generate work space needed by dense LU factorization */
4906   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
4907   mwork    = v_work + bs;
4908   v_pivots = (PetscInt*)(mwork + bs2);
4909 
4910   for (i=0; i<n; i++){
4911     /* zero rtmp */
4912     /* L part */
4913     nz    = bi[i+1] - bi[i];
4914     bjtmp = bj + bi[i];
4915     for  (j=0; j<nz; j++){
4916       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4917     }
4918 
4919     /* U part */
4920     nz = bi[2*n-i+1] - bi[2*n-i];
4921     bjtmp = bj + bi[2*n-i];
4922     for  (j=0; j<nz; j++){
4923       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4924     }
4925 
4926     /* load in initial (unfactored row) */
4927     nz    = ai[r[i]+1] - ai[r[i]];
4928     ajtmp = aj + ai[r[i]];
4929     v     = aa + bs2*ai[r[i]];
4930     for (j=0; j<nz; j++) {
4931       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
4932     }
4933 
4934     /* elimination */
4935     bjtmp = bj + bi[i];
4936     nzL   = bi[i+1] - bi[i];
4937     for(k=0;k < nzL;k++) {
4938       row = bjtmp[k];
4939       pc = rtmp + bs2*row;
4940       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4941       if (flg) {
4942         pv         = b->a + bs2*bdiag[row];
4943         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
4944         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
4945         pv         = b->a + bs2*bi[2*n-row];
4946         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
4947         for (j=0; j<nz; j++) {
4948           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4949         }
4950         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
4951       }
4952     }
4953 
4954     /* finished row so stick it into b->a */
4955     /* L part */
4956     pv   = b->a + bs2*bi[i] ;
4957     pj   = b->j + bi[i] ;
4958     nz   = bi[i+1] - bi[i];
4959     for (j=0; j<nz; j++) {
4960       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4961     }
4962 
4963     /* Mark diagonal and invert diagonal for simplier triangular solves */
4964     pv  = b->a + bs2*bdiag[i];
4965     pj  = b->j + bdiag[i];
4966     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4967     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4968     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
4969 
4970     /* U part */
4971     pv = b->a + bs2*bi[2*n-i];
4972     pj = b->j + bi[2*n-i];
4973     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
4974     for (j=0; j<nz; j++){
4975       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4976     }
4977   }
4978 
4979   ierr = PetscFree(rtmp);CHKERRQ(ierr);
4980   ierr = PetscFree(v_work);CHKERRQ(ierr);
4981   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4982   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4983 
4984   C->assembled = PETSC_TRUE;
4985   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
4986   PetscFunctionReturn(0);
4987 }
4988 
4989 #undef __FUNCT__
4990 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2"
4991 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
4992 {
4993   Mat            C=B;
4994   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
4995   IS             isrow = b->row,isicol = b->icol;
4996   PetscErrorCode ierr;
4997   const PetscInt *r,*ic,*ics;
4998   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
4999   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5000   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5001   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5002   MatScalar      *v_work;
5003 
5004   PetscFunctionBegin;
5005   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5006   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5007   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5008   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
5009   ics  = ic;
5010 
5011   /* generate work space needed by dense LU factorization */
5012   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
5013   mwork    = v_work + bs;
5014   v_pivots = (PetscInt*)(mwork + bs2);
5015 
5016   for (i=0; i<n; i++){
5017     /* zero rtmp */
5018     /* L part */
5019     nz    = bi[i+1] - bi[i];
5020     bjtmp = bj + bi[i];
5021     for  (j=0; j<nz; j++){
5022       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5023     }
5024 
5025     /* U part */
5026     nz = bdiag[i] - bdiag[i+1];
5027     bjtmp = bj + bdiag[i+1]+1;
5028     for  (j=0; j<nz; j++){
5029       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5030     }
5031 
5032     /* load in initial (unfactored row) */
5033     nz    = ai[r[i]+1] - ai[r[i]];
5034     ajtmp = aj + ai[r[i]];
5035     v     = aa + bs2*ai[r[i]];
5036     for (j=0; j<nz; j++) {
5037       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5038     }
5039 
5040     /* elimination */
5041     bjtmp = bj + bi[i];
5042     nzL   = bi[i+1] - bi[i];
5043     for(k=0;k < nzL;k++) {
5044       row = bjtmp[k];
5045       pc = rtmp + bs2*row;
5046       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5047       if (flg) {
5048         pv         = b->a + bs2*bdiag[row];
5049         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5050         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5051         pv         = b->a + bs2*(bdiag[row+1]+1);
5052         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5053         for (j=0; j<nz; j++) {
5054           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5055         }
5056         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5057       }
5058     }
5059 
5060     /* finished row so stick it into b->a */
5061     /* L part */
5062     pv   = b->a + bs2*bi[i] ;
5063     pj   = b->j + bi[i] ;
5064     nz   = bi[i+1] - bi[i];
5065     for (j=0; j<nz; j++) {
5066       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5067     }
5068 
5069     /* Mark diagonal and invert diagonal for simplier triangular solves */
5070     pv  = b->a + bs2*bdiag[i];
5071     pj  = b->j + bdiag[i];
5072     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5073     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5074     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5075 
5076     /* U part */
5077     pv = b->a + bs2*(bdiag[i+1]+1);
5078     pj = b->j + bdiag[i+1]+1;
5079     nz = bdiag[i] - bdiag[i+1] - 1;
5080     for (j=0; j<nz; j++){
5081       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5082     }
5083   }
5084 
5085   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5086   ierr = PetscFree(v_work);CHKERRQ(ierr);
5087   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5088   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5089 
5090   C->assembled = PETSC_TRUE;
5091   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5092   PetscFunctionReturn(0);
5093 }
5094 
5095 /*
5096    ilu(0) with natural ordering under new data structure.
5097    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
5098    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
5099 */
5100 #undef __FUNCT__
5101 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
5102 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5103 {
5104 
5105   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5106   PetscErrorCode     ierr;
5107   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5108   PetscInt           i,j,nz,*bi,*bj,*bdiag;
5109 
5110   PetscFunctionBegin;
5111   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
5112   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5113   b    = (Mat_SeqBAIJ*)(fact)->data;
5114 
5115   /* allocate matrix arrays for new data structure */
5116   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
5117   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
5118   b->singlemalloc = PETSC_TRUE;
5119   if (!b->diag){
5120     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5121   }
5122   bdiag = b->diag;
5123 
5124   if (n > 0) {
5125     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5126   }
5127 
5128   /* set bi and bj with new data structure */
5129   bi = b->i;
5130   bj = b->j;
5131 
5132   /* L part */
5133   bi[0] = 0;
5134   for (i=0; i<n; i++){
5135     nz = adiag[i] - ai[i];
5136     bi[i+1] = bi[i] + nz;
5137     aj = a->j + ai[i];
5138     for (j=0; j<nz; j++){
5139       *bj = aj[j]; bj++;
5140     }
5141   }
5142 
5143   /* U part */
5144   bi[n+1] = bi[n];
5145   for (i=n-1; i>=0; i--){
5146     nz = ai[i+1] - adiag[i] - 1;
5147     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
5148     aj = a->j + adiag[i] + 1;
5149     for (j=0; j<nz; j++){
5150       *bj = aj[j]; bj++;
5151     }
5152     /* diag[i] */
5153     *bj = i; bj++;
5154     bdiag[i] = bi[2*n-i+1]-1;
5155   }
5156   PetscFunctionReturn(0);
5157 }
5158 
5159 #undef __FUNCT__
5160 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
5161 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5162 {
5163   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5164   IS                 isicol;
5165   PetscErrorCode     ierr;
5166   const PetscInt     *r,*ic;
5167   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5168   PetscInt           *bi,*cols,nnz,*cols_lvl;
5169   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5170   PetscInt           i,levels,diagonal_fill;
5171   PetscTruth         col_identity,row_identity,both_identity;
5172   PetscReal          f;
5173   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5174   PetscBT            lnkbt;
5175   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5176   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5177   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5178   PetscTruth         missing;
5179   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5180 
5181   PetscFunctionBegin;
5182   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5183   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5184   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5185 
5186   f             = info->fill;
5187   levels        = (PetscInt)info->levels;
5188   diagonal_fill = (PetscInt)info->diagonal_fill;
5189   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5190 
5191   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5192   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5193   both_identity = (PetscTruth) (row_identity && col_identity);
5194 
5195   if (!levels && both_identity) {
5196     /* special case: ilu(0) with natural ordering */
5197     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5198     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
5199     /* set MatSolve routines */
5200     switch (bs){
5201     case 2:
5202       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
5203       break;
5204     case 3:
5205       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
5206       break;
5207     case 4:
5208       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
5209       break;
5210     case 5:
5211       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
5212       break;
5213     case 6:
5214       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
5215       break;
5216     case 7:
5217       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
5218       break;
5219     default:
5220       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
5221       break;
5222     }
5223 
5224     fact->factor = MAT_FACTOR_ILU;
5225     (fact)->info.factor_mallocs    = 0;
5226     (fact)->info.fill_ratio_given  = info->fill;
5227     (fact)->info.fill_ratio_needed = 1.0;
5228     b                = (Mat_SeqBAIJ*)(fact)->data;
5229     b->row           = isrow;
5230     b->col           = iscol;
5231     b->icol          = isicol;
5232     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5233     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5234     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5235     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5236     PetscFunctionReturn(0);
5237   }
5238 
5239   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5240   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5241 
5242   /* get new row pointers */
5243   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5244   bi[0] = 0;
5245   /* bdiag is location of diagonal in factor */
5246   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5247   bdiag[0]  = 0;
5248 
5249   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
5250   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
5251 
5252   /* create a linked list for storing column indices of the active row */
5253   nlnk = n + 1;
5254   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5255 
5256   /* initial FreeSpace size is f*(ai[n]+1) */
5257   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5258   current_space = free_space;
5259   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5260   current_space_lvl = free_space_lvl;
5261 
5262   for (i=0; i<n; i++) {
5263     nzi = 0;
5264     /* copy current row into linked list */
5265     nnz  = ai[r[i]+1] - ai[r[i]];
5266     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5267     cols = aj + ai[r[i]];
5268     lnk[i] = -1; /* marker to indicate if diagonal exists */
5269     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5270     nzi += nlnk;
5271 
5272     /* make sure diagonal entry is included */
5273     if (diagonal_fill && lnk[i] == -1) {
5274       fm = n;
5275       while (lnk[fm] < i) fm = lnk[fm];
5276       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5277       lnk[fm]    = i;
5278       lnk_lvl[i] = 0;
5279       nzi++; dcount++;
5280     }
5281 
5282     /* add pivot rows into the active row */
5283     nzbd = 0;
5284     prow = lnk[n];
5285     while (prow < i) {
5286       nnz      = bdiag[prow];
5287       cols     = bj_ptr[prow] + nnz + 1;
5288       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5289       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5290       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5291       nzi += nlnk;
5292       prow = lnk[prow];
5293       nzbd++;
5294     }
5295     bdiag[i] = nzbd;
5296     bi[i+1]  = bi[i] + nzi;
5297 
5298     /* if free space is not available, make more free space */
5299     if (current_space->local_remaining<nzi) {
5300       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5301       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5302       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5303       reallocs++;
5304     }
5305 
5306     /* copy data into free_space and free_space_lvl, then initialize lnk */
5307     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5308     bj_ptr[i]    = current_space->array;
5309     bjlvl_ptr[i] = current_space_lvl->array;
5310 
5311     /* make sure the active row i has diagonal entry */
5312     if (*(bj_ptr[i]+bdiag[i]) != i) {
5313       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5314     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5315     }
5316 
5317     current_space->array           += nzi;
5318     current_space->local_used      += nzi;
5319     current_space->local_remaining -= nzi;
5320     current_space_lvl->array           += nzi;
5321     current_space_lvl->local_used      += nzi;
5322     current_space_lvl->local_remaining -= nzi;
5323   }
5324 
5325   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5326   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5327 
5328   /* destroy list of free space and other temporary arrays */
5329   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5330 
5331   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5332   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5333 
5334   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5335   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5336   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
5337 
5338 #if defined(PETSC_USE_INFO)
5339   {
5340     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5341     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5342     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5343     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5344     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5345     if (diagonal_fill) {
5346       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5347     }
5348   }
5349 #endif
5350 
5351   /* put together the new matrix */
5352   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5353   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5354   b = (Mat_SeqBAIJ*)(fact)->data;
5355   b->free_a       = PETSC_TRUE;
5356   b->free_ij      = PETSC_TRUE;
5357   b->singlemalloc = PETSC_FALSE;
5358   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5359   b->j          = bj;
5360   b->i          = bi;
5361   b->diag       = bdiag;
5362   b->free_diag  = PETSC_TRUE;
5363   b->ilen       = 0;
5364   b->imax       = 0;
5365   b->row        = isrow;
5366   b->col        = iscol;
5367   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5368   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5369   b->icol       = isicol;
5370   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5371   /* In b structure:  Free imax, ilen, old a, old j.
5372      Allocate bdiag, solve_work, new a, new j */
5373   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5374   b->maxnz = b->nz = bi[2*n+1] ;
5375   (fact)->info.factor_mallocs    = reallocs;
5376   (fact)->info.fill_ratio_given  = f;
5377   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
5378   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
5379   /* set MatSolve routines */
5380   if (both_identity){
5381     switch (bs){
5382     case 2:
5383       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
5384       break;
5385     case 3:
5386       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
5387       break;
5388     case 4:
5389       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
5390       break;
5391     case 5:
5392       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
5393       break;
5394     case 6:
5395       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
5396       break;
5397     case 7:
5398       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
5399       break;
5400     default:
5401       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
5402       break;
5403     }
5404   } else {
5405     switch (bs){
5406     case 2:
5407       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
5408       break;
5409     case 3:
5410       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
5411       break;
5412     case 4:
5413       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
5414       break;
5415     case 5:
5416       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
5417       break;
5418     case 6:
5419       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
5420       break;
5421     case 7:
5422       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
5423       break;
5424     default:
5425       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
5426       break;
5427     }
5428   }
5429   PetscFunctionReturn(0);
5430 }
5431 
5432 /*
5433      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5434    except that the data structure of Mat_SeqAIJ is slightly different.
5435    Not a good example of code reuse.
5436 */
5437 #undef __FUNCT__
5438 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5439 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5440 {
5441   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5442   IS             isicol;
5443   PetscErrorCode ierr;
5444   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5445   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5446   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5447   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5448   PetscTruth     col_identity,row_identity,both_identity,flg;
5449   PetscReal      f;
5450   PetscTruth     newdatastruct=PETSC_FALSE;
5451 
5452   PetscFunctionBegin;
5453   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
5454   if (newdatastruct){
5455     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5456     PetscFunctionReturn(0);
5457   }
5458 
5459   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5460   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5461 
5462   f             = info->fill;
5463   levels        = (PetscInt)info->levels;
5464   diagonal_fill = (PetscInt)info->diagonal_fill;
5465   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5466 
5467   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5468   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5469   both_identity = (PetscTruth) (row_identity && col_identity);
5470 
5471   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5472     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5473     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5474 
5475     fact->factor = MAT_FACTOR_ILU;
5476     b            = (Mat_SeqBAIJ*)(fact)->data;
5477     b->row       = isrow;
5478     b->col       = iscol;
5479     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5480     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5481     b->icol      = isicol;
5482     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5483     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5484     PetscFunctionReturn(0);
5485   }
5486 
5487   /* general case perform the symbolic factorization */
5488     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5489     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5490 
5491     /* get new row pointers */
5492     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5493     ainew[0] = 0;
5494     /* don't know how many column pointers are needed so estimate */
5495     jmax = (PetscInt)(f*ai[n] + 1);
5496     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5497     /* ajfill is level of fill for each fill entry */
5498     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5499     /* fill is a linked list of nonzeros in active row */
5500     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5501     /* im is level for each filled value */
5502     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5503     /* dloc is location of diagonal in factor */
5504     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5505     dloc[0]  = 0;
5506     for (prow=0; prow<n; prow++) {
5507 
5508       /* copy prow into linked list */
5509       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5510       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5511       xi         = aj + ai[r[prow]];
5512       fill[n]    = n;
5513       fill[prow] = -1; /* marker for diagonal entry */
5514       while (nz--) {
5515 	fm  = n;
5516 	idx = ic[*xi++];
5517 	do {
5518 	  m  = fm;
5519 	  fm = fill[m];
5520 	} while (fm < idx);
5521 	fill[m]   = idx;
5522 	fill[idx] = fm;
5523 	im[idx]   = 0;
5524       }
5525 
5526       /* make sure diagonal entry is included */
5527       if (diagonal_fill && fill[prow] == -1) {
5528 	fm = n;
5529 	while (fill[fm] < prow) fm = fill[fm];
5530 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5531 	fill[fm]   = prow;
5532 	im[prow]   = 0;
5533 	nzf++;
5534 	dcount++;
5535       }
5536 
5537       nzi = 0;
5538       row = fill[n];
5539       while (row < prow) {
5540 	incrlev = im[row] + 1;
5541 	nz      = dloc[row];
5542 	xi      = ajnew  + ainew[row] + nz + 1;
5543 	flev    = ajfill + ainew[row] + nz + 1;
5544 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5545 	fm      = row;
5546 	while (nnz-- > 0) {
5547 	  idx = *xi++;
5548 	  if (*flev + incrlev > levels) {
5549 	    flev++;
5550 	    continue;
5551 	  }
5552 	  do {
5553 	    m  = fm;
5554 	    fm = fill[m];
5555 	  } while (fm < idx);
5556 	  if (fm != idx) {
5557 	    im[idx]   = *flev + incrlev;
5558 	    fill[m]   = idx;
5559 	    fill[idx] = fm;
5560 	    fm        = idx;
5561 	    nzf++;
5562 	  } else {
5563 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5564 	  }
5565 	  flev++;
5566 	}
5567 	row = fill[row];
5568 	nzi++;
5569       }
5570       /* copy new filled row into permanent storage */
5571       ainew[prow+1] = ainew[prow] + nzf;
5572       if (ainew[prow+1] > jmax) {
5573 
5574 	/* estimate how much additional space we will need */
5575 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5576 	/* just double the memory each time */
5577 	PetscInt maxadd = jmax;
5578 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5579 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5580 	jmax += maxadd;
5581 
5582 	/* allocate a longer ajnew and ajfill */
5583 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5584 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5585 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5586 	ajnew = xitmp;
5587 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5588 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5589 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5590 	ajfill = xitmp;
5591 	reallocate++; /* count how many reallocations are needed */
5592       }
5593       xitmp       = ajnew + ainew[prow];
5594       flev        = ajfill + ainew[prow];
5595       dloc[prow]  = nzi;
5596       fm          = fill[n];
5597       while (nzf--) {
5598 	*xitmp++ = fm;
5599 	*flev++ = im[fm];
5600 	fm      = fill[fm];
5601       }
5602       /* make sure row has diagonal entry */
5603       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5604 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5605     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5606       }
5607     }
5608     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5609     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5610     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5611     ierr = PetscFree(fill);CHKERRQ(ierr);
5612     ierr = PetscFree(im);CHKERRQ(ierr);
5613 
5614 #if defined(PETSC_USE_INFO)
5615     {
5616       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5617       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5618       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5619       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5620       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5621       if (diagonal_fill) {
5622 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5623       }
5624     }
5625 #endif
5626 
5627     /* put together the new matrix */
5628     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5629     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5630     b    = (Mat_SeqBAIJ*)(fact)->data;
5631     b->free_a       = PETSC_TRUE;
5632     b->free_ij      = PETSC_TRUE;
5633     b->singlemalloc = PETSC_FALSE;
5634     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5635     b->j          = ajnew;
5636     b->i          = ainew;
5637     for (i=0; i<n; i++) dloc[i] += ainew[i];
5638     b->diag       = dloc;
5639     b->free_diag  = PETSC_TRUE;
5640     b->ilen       = 0;
5641     b->imax       = 0;
5642     b->row        = isrow;
5643     b->col        = iscol;
5644     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5645     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5646     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5647     b->icol       = isicol;
5648     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5649     /* In b structure:  Free imax, ilen, old a, old j.
5650        Allocate dloc, solve_work, new a, new j */
5651     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5652     b->maxnz          = b->nz = ainew[n];
5653 
5654     (fact)->info.factor_mallocs    = reallocate;
5655     (fact)->info.fill_ratio_given  = f;
5656     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5657 
5658   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5659   PetscFunctionReturn(0);
5660 }
5661 
5662 #undef __FUNCT__
5663 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5664 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5665 {
5666   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5667   /* int i,*AJ=a->j,nz=a->nz; */
5668   PetscFunctionBegin;
5669   /* Undo Column scaling */
5670 /*    while (nz--) { */
5671 /*      AJ[i] = AJ[i]/4; */
5672 /*    } */
5673   /* This should really invoke a push/pop logic, but we don't have that yet. */
5674   A->ops->setunfactored = PETSC_NULL;
5675   PetscFunctionReturn(0);
5676 }
5677 
5678 #undef __FUNCT__
5679 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5680 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5681 {
5682   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5683   PetscInt       *AJ=a->j,nz=a->nz;
5684   unsigned short *aj=(unsigned short *)AJ;
5685   PetscFunctionBegin;
5686   /* Is this really necessary? */
5687   while (nz--) {
5688     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5689   }
5690   A->ops->setunfactored = PETSC_NULL;
5691   PetscFunctionReturn(0);
5692 }
5693 
5694 
5695