xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision b2b2dd246975d7f9c8a1571def503d28e659d8b1)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124   PetscInt       *diag = a->diag,oidx;
125   MatScalar      *aa=a->a,*v;
126   PetscScalar    s1,s2,s3,x1,x2,x3;
127   PetscScalar    *x,*b;
128 
129   PetscFunctionBegin;
130   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133 
134   /* forward solve the U^T */
135   idx = 0;
136   for (i=0; i<n; i++) {
137 
138     v     = aa + 9*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144     v += 9;
145 
146     vi    = aj + diag[i] + 1;
147     nz    = ai[i+1] - diag[i] - 1;
148     while (nz--) {
149       oidx = 3*(*vi++);
150       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153       v  += 9;
154     }
155     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156     idx += 3;
157   }
158   /* backward solve the L^T */
159   for (i=n-1; i>=0; i--){
160     v    = aa + 9*diag[i] - 9;
161     vi   = aj + diag[i] - 1;
162     nz   = diag[i] - ai[i];
163     idt  = 3*i;
164     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165     while (nz--) {
166       idx   = 3*(*vi--);
167       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170       v -= 9;
171     }
172   }
173   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182 {
183   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184   PetscErrorCode ierr;
185   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186   PetscInt       *diag = a->diag,oidx;
187   MatScalar      *aa=a->a,*v;
188   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
189   PetscScalar    *x,*b;
190 
191   PetscFunctionBegin;
192   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
193   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195 
196   /* forward solve the U^T */
197   idx = 0;
198   for (i=0; i<n; i++) {
199 
200     v     = aa + 16*diag[i];
201     /* multiply by the inverse of the block diagonal */
202     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207     v += 16;
208 
209     vi    = aj + diag[i] + 1;
210     nz    = ai[i+1] - diag[i] - 1;
211     while (nz--) {
212       oidx = 4*(*vi++);
213       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217       v  += 16;
218     }
219     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220     idx += 4;
221   }
222   /* backward solve the L^T */
223   for (i=n-1; i>=0; i--){
224     v    = aa + 16*diag[i] - 16;
225     vi   = aj + diag[i] - 1;
226     nz   = diag[i] - ai[i];
227     idt  = 4*i;
228     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229     while (nz--) {
230       idx   = 4*(*vi--);
231       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235       v -= 16;
236     }
237   }
238   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
239   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247 {
248   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249   PetscErrorCode ierr;
250   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251   PetscInt       *diag = a->diag,oidx;
252   MatScalar      *aa=a->a,*v;
253   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
254   PetscScalar    *x,*b;
255 
256   PetscFunctionBegin;
257   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
258   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260 
261   /* forward solve the U^T */
262   idx = 0;
263   for (i=0; i<n; i++) {
264 
265     v     = aa + 25*diag[i];
266     /* multiply by the inverse of the block diagonal */
267     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273     v += 25;
274 
275     vi    = aj + diag[i] + 1;
276     nz    = ai[i+1] - diag[i] - 1;
277     while (nz--) {
278       oidx = 5*(*vi++);
279       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284       v  += 25;
285     }
286     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287     idx += 5;
288   }
289   /* backward solve the L^T */
290   for (i=n-1; i>=0; i--){
291     v    = aa + 25*diag[i] - 25;
292     vi   = aj + diag[i] - 1;
293     nz   = diag[i] - ai[i];
294     idt  = 5*i;
295     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296     while (nz--) {
297       idx   = 5*(*vi--);
298       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303       v -= 25;
304     }
305   }
306   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
307   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315 {
316   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317   PetscErrorCode ierr;
318   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319   PetscInt       *diag = a->diag,oidx;
320   MatScalar      *aa=a->a,*v;
321   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
322   PetscScalar    *x,*b;
323 
324   PetscFunctionBegin;
325   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
326   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
327   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328 
329   /* forward solve the U^T */
330   idx = 0;
331   for (i=0; i<n; i++) {
332 
333     v     = aa + 36*diag[i];
334     /* multiply by the inverse of the block diagonal */
335     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336     x6    = x[5+idx];
337     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343     v += 36;
344 
345     vi    = aj + diag[i] + 1;
346     nz    = ai[i+1] - diag[i] - 1;
347     while (nz--) {
348       oidx = 6*(*vi++);
349       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355       v  += 36;
356     }
357     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358     x[5+idx] = s6;
359     idx += 6;
360   }
361   /* backward solve the L^T */
362   for (i=n-1; i>=0; i--){
363     v    = aa + 36*diag[i] - 36;
364     vi   = aj + diag[i] - 1;
365     nz   = diag[i] - ai[i];
366     idt  = 6*i;
367     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368     s6 = x[5+idt];
369     while (nz--) {
370       idx   = 6*(*vi--);
371       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377       v -= 36;
378     }
379   }
380   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389 {
390   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391   PetscErrorCode ierr;
392   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393   PetscInt       *diag = a->diag,oidx;
394   MatScalar      *aa=a->a,*v;
395   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
396   PetscScalar    *x,*b;
397 
398   PetscFunctionBegin;
399   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
400   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
401   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402 
403   /* forward solve the U^T */
404   idx = 0;
405   for (i=0; i<n; i++) {
406 
407     v     = aa + 49*diag[i];
408     /* multiply by the inverse of the block diagonal */
409     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410     x6    = x[5+idx]; x7 = x[6+idx];
411     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418     v += 49;
419 
420     vi    = aj + diag[i] + 1;
421     nz    = ai[i+1] - diag[i] - 1;
422     while (nz--) {
423       oidx = 7*(*vi++);
424       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431       v  += 49;
432     }
433     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434     x[5+idx] = s6;x[6+idx] = s7;
435     idx += 7;
436   }
437   /* backward solve the L^T */
438   for (i=n-1; i>=0; i--){
439     v    = aa + 49*diag[i] - 49;
440     vi   = aj + diag[i] - 1;
441     nz   = diag[i] - ai[i];
442     idt  = 7*i;
443     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444     s6 = x[5+idt];s7 = x[6+idt];
445     while (nz--) {
446       idx   = 7*(*vi--);
447       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454       v -= 49;
455     }
456   }
457   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 /*---------------------------------------------------------------------------------------------*/
464 #undef __FUNCT__
465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469   IS             iscol=a->col,isrow=a->row;
470   PetscErrorCode ierr;
471   const PetscInt *r,*c,*rout,*cout;
472   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473   PetscInt       *diag = a->diag;
474   MatScalar      *aa=a->a,*v;
475   PetscScalar    s1,*x,*b,*t;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   t  = a->solve_work;
481 
482   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484 
485   /* copy the b into temp work space according to permutation */
486   for (i=0; i<n; i++) {
487     t[i] = b[c[i]];
488   }
489 
490   /* forward solve the U^T */
491   for (i=0; i<n; i++) {
492 
493     v     = aa + diag[i];
494     /* multiply by the inverse of the block diagonal */
495     s1    = (*v++)*t[i];
496     vi    = aj + diag[i] + 1;
497     nz    = ai[i+1] - diag[i] - 1;
498     while (nz--) {
499       t[*vi++]  -= (*v++)*s1;
500     }
501     t[i]   = s1;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--){
505     v    = aa + diag[i] - 1;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     s1   = t[i];
509     while (nz--) {
510       t[*vi--]   -=  (*v--)*s1;
511     }
512   }
513 
514   /* copy t into x according to permutation */
515   for (i=0; i<n; i++) {
516     x[r[i]]   = t[i];
517   }
518 
519   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
521   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524   PetscFunctionReturn(0);
525 }
526 
527 #undef __FUNCT__
528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530 {
531   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532   IS             iscol=a->col,isrow=a->row;
533   PetscErrorCode ierr;
534   const PetscInt *r,*c,*rout,*cout;
535   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537   MatScalar      *aa=a->a,*v;
538   PetscScalar    s1,s2,x1,x2;
539   PetscScalar    *x,*b,*t;
540 
541   PetscFunctionBegin;
542   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544   t  = a->solve_work;
545 
546   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548 
549   /* copy the b into temp work space according to permutation */
550   ii = 0;
551   for (i=0; i<n; i++) {
552     ic      = 2*c[i];
553     t[ii]   = b[ic];
554     t[ii+1] = b[ic+1];
555     ii += 2;
556   }
557 
558   /* forward solve the U^T */
559   idx = 0;
560   for (i=0; i<n; i++) {
561 
562     v     = aa + 4*diag[i];
563     /* multiply by the inverse of the block diagonal */
564     x1    = t[idx];   x2 = t[1+idx];
565     s1 = v[0]*x1  +  v[1]*x2;
566     s2 = v[2]*x1  +  v[3]*x2;
567     v += 4;
568 
569     vi    = aj + diag[i] + 1;
570     nz    = ai[i+1] - diag[i] - 1;
571     while (nz--) {
572       oidx = 2*(*vi++);
573       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575       v  += 4;
576     }
577     t[idx]   = s1;t[1+idx] = s2;
578     idx += 2;
579   }
580   /* backward solve the L^T */
581   for (i=n-1; i>=0; i--){
582     v    = aa + 4*diag[i] - 4;
583     vi   = aj + diag[i] - 1;
584     nz   = diag[i] - ai[i];
585     idt  = 2*i;
586     s1 = t[idt];  s2 = t[1+idt];
587     while (nz--) {
588       idx   = 2*(*vi--);
589       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591       v -= 4;
592     }
593   }
594 
595   /* copy t into x according to permutation */
596   ii = 0;
597   for (i=0; i<n; i++) {
598     ir      = 2*r[i];
599     x[ir]   = t[ii];
600     x[ir+1] = t[ii+1];
601     ii += 2;
602   }
603 
604   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
606   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615 {
616   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617   IS             iscol=a->col,isrow=a->row;
618   PetscErrorCode ierr;
619   const PetscInt *r,*c,*rout,*cout;
620   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622   MatScalar      *aa=a->a,*v;
623   PetscScalar    s1,s2,s3,x1,x2,x3;
624   PetscScalar    *x,*b,*t;
625 
626   PetscFunctionBegin;
627   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629   t  = a->solve_work;
630 
631   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633 
634   /* copy the b into temp work space according to permutation */
635   ii = 0;
636   for (i=0; i<n; i++) {
637     ic      = 3*c[i];
638     t[ii]   = b[ic];
639     t[ii+1] = b[ic+1];
640     t[ii+2] = b[ic+2];
641     ii += 3;
642   }
643 
644   /* forward solve the U^T */
645   idx = 0;
646   for (i=0; i<n; i++) {
647 
648     v     = aa + 9*diag[i];
649     /* multiply by the inverse of the block diagonal */
650     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654     v += 9;
655 
656     vi    = aj + diag[i] + 1;
657     nz    = ai[i+1] - diag[i] - 1;
658     while (nz--) {
659       oidx = 3*(*vi++);
660       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663       v  += 9;
664     }
665     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666     idx += 3;
667   }
668   /* backward solve the L^T */
669   for (i=n-1; i>=0; i--){
670     v    = aa + 9*diag[i] - 9;
671     vi   = aj + diag[i] - 1;
672     nz   = diag[i] - ai[i];
673     idt  = 3*i;
674     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675     while (nz--) {
676       idx   = 3*(*vi--);
677       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680       v -= 9;
681     }
682   }
683 
684   /* copy t into x according to permutation */
685   ii = 0;
686   for (i=0; i<n; i++) {
687     ir      = 3*r[i];
688     x[ir]   = t[ii];
689     x[ir+1] = t[ii+1];
690     x[ir+2] = t[ii+2];
691     ii += 3;
692   }
693 
694   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
696   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 #undef __FUNCT__
703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705 {
706   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707   IS             iscol=a->col,isrow=a->row;
708   PetscErrorCode ierr;
709   const PetscInt *r,*c,*rout,*cout;
710   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712   MatScalar      *aa=a->a,*v;
713   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
714   PetscScalar    *x,*b,*t;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   t  = a->solve_work;
720 
721   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723 
724   /* copy the b into temp work space according to permutation */
725   ii = 0;
726   for (i=0; i<n; i++) {
727     ic      = 4*c[i];
728     t[ii]   = b[ic];
729     t[ii+1] = b[ic+1];
730     t[ii+2] = b[ic+2];
731     t[ii+3] = b[ic+3];
732     ii += 4;
733   }
734 
735   /* forward solve the U^T */
736   idx = 0;
737   for (i=0; i<n; i++) {
738 
739     v     = aa + 16*diag[i];
740     /* multiply by the inverse of the block diagonal */
741     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746     v += 16;
747 
748     vi    = aj + diag[i] + 1;
749     nz    = ai[i+1] - diag[i] - 1;
750     while (nz--) {
751       oidx = 4*(*vi++);
752       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756       v  += 16;
757     }
758     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759     idx += 4;
760   }
761   /* backward solve the L^T */
762   for (i=n-1; i>=0; i--){
763     v    = aa + 16*diag[i] - 16;
764     vi   = aj + diag[i] - 1;
765     nz   = diag[i] - ai[i];
766     idt  = 4*i;
767     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768     while (nz--) {
769       idx   = 4*(*vi--);
770       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774       v -= 16;
775     }
776   }
777 
778   /* copy t into x according to permutation */
779   ii = 0;
780   for (i=0; i<n; i++) {
781     ir      = 4*r[i];
782     x[ir]   = t[ii];
783     x[ir+1] = t[ii+1];
784     x[ir+2] = t[ii+2];
785     x[ir+3] = t[ii+3];
786     ii += 4;
787   }
788 
789   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
791   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 #undef __FUNCT__
798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800 {
801   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802   IS             iscol=a->col,isrow=a->row;
803   PetscErrorCode ierr;
804   const PetscInt *r,*c,*rout,*cout;
805   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807   MatScalar      *aa=a->a,*v;
808   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
809   PetscScalar    *x,*b,*t;
810 
811   PetscFunctionBegin;
812   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   t  = a->solve_work;
815 
816   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818 
819   /* copy the b into temp work space according to permutation */
820   ii = 0;
821   for (i=0; i<n; i++) {
822     ic      = 5*c[i];
823     t[ii]   = b[ic];
824     t[ii+1] = b[ic+1];
825     t[ii+2] = b[ic+2];
826     t[ii+3] = b[ic+3];
827     t[ii+4] = b[ic+4];
828     ii += 5;
829   }
830 
831   /* forward solve the U^T */
832   idx = 0;
833   for (i=0; i<n; i++) {
834 
835     v     = aa + 25*diag[i];
836     /* multiply by the inverse of the block diagonal */
837     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843     v += 25;
844 
845     vi    = aj + diag[i] + 1;
846     nz    = ai[i+1] - diag[i] - 1;
847     while (nz--) {
848       oidx = 5*(*vi++);
849       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v  += 25;
855     }
856     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857     idx += 5;
858   }
859   /* backward solve the L^T */
860   for (i=n-1; i>=0; i--){
861     v    = aa + 25*diag[i] - 25;
862     vi   = aj + diag[i] - 1;
863     nz   = diag[i] - ai[i];
864     idt  = 5*i;
865     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866     while (nz--) {
867       idx   = 5*(*vi--);
868       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873       v -= 25;
874     }
875   }
876 
877   /* copy t into x according to permutation */
878   ii = 0;
879   for (i=0; i<n; i++) {
880     ir      = 5*r[i];
881     x[ir]   = t[ii];
882     x[ir+1] = t[ii+1];
883     x[ir+2] = t[ii+2];
884     x[ir+3] = t[ii+3];
885     x[ir+4] = t[ii+4];
886     ii += 5;
887   }
888 
889   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
891   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902   IS             iscol=a->col,isrow=a->row;
903   PetscErrorCode ierr;
904   const PetscInt *r,*c,*rout,*cout;
905   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907   MatScalar      *aa=a->a,*v;
908   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
909   PetscScalar    *x,*b,*t;
910 
911   PetscFunctionBegin;
912   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914   t  = a->solve_work;
915 
916   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918 
919   /* copy the b into temp work space according to permutation */
920   ii = 0;
921   for (i=0; i<n; i++) {
922     ic      = 6*c[i];
923     t[ii]   = b[ic];
924     t[ii+1] = b[ic+1];
925     t[ii+2] = b[ic+2];
926     t[ii+3] = b[ic+3];
927     t[ii+4] = b[ic+4];
928     t[ii+5] = b[ic+5];
929     ii += 6;
930   }
931 
932   /* forward solve the U^T */
933   idx = 0;
934   for (i=0; i<n; i++) {
935 
936     v     = aa + 36*diag[i];
937     /* multiply by the inverse of the block diagonal */
938     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939     x6    = t[5+idx];
940     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946     v += 36;
947 
948     vi    = aj + diag[i] + 1;
949     nz    = ai[i+1] - diag[i] - 1;
950     while (nz--) {
951       oidx = 6*(*vi++);
952       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958       v  += 36;
959     }
960     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961     t[5+idx] = s6;
962     idx += 6;
963   }
964   /* backward solve the L^T */
965   for (i=n-1; i>=0; i--){
966     v    = aa + 36*diag[i] - 36;
967     vi   = aj + diag[i] - 1;
968     nz   = diag[i] - ai[i];
969     idt  = 6*i;
970     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971     s6 = t[5+idt];
972     while (nz--) {
973       idx   = 6*(*vi--);
974       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980       v -= 36;
981     }
982   }
983 
984   /* copy t into x according to permutation */
985   ii = 0;
986   for (i=0; i<n; i++) {
987     ir      = 6*r[i];
988     x[ir]   = t[ii];
989     x[ir+1] = t[ii+1];
990     x[ir+2] = t[ii+2];
991     x[ir+3] = t[ii+3];
992     x[ir+4] = t[ii+4];
993     x[ir+5] = t[ii+5];
994     ii += 6;
995   }
996 
997   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
999   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008 {
1009   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010   IS             iscol=a->col,isrow=a->row;
1011   PetscErrorCode ierr;
1012   const PetscInt *r,*c,*rout,*cout;
1013   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015   MatScalar      *aa=a->a,*v;
1016   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1017   PetscScalar    *x,*b,*t;
1018 
1019   PetscFunctionBegin;
1020   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022   t  = a->solve_work;
1023 
1024   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026 
1027   /* copy the b into temp work space according to permutation */
1028   ii = 0;
1029   for (i=0; i<n; i++) {
1030     ic      = 7*c[i];
1031     t[ii]   = b[ic];
1032     t[ii+1] = b[ic+1];
1033     t[ii+2] = b[ic+2];
1034     t[ii+3] = b[ic+3];
1035     t[ii+4] = b[ic+4];
1036     t[ii+5] = b[ic+5];
1037     t[ii+6] = b[ic+6];
1038     ii += 7;
1039   }
1040 
1041   /* forward solve the U^T */
1042   idx = 0;
1043   for (i=0; i<n; i++) {
1044 
1045     v     = aa + 49*diag[i];
1046     /* multiply by the inverse of the block diagonal */
1047     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048     x6    = t[5+idx]; x7 = t[6+idx];
1049     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056     v += 49;
1057 
1058     vi    = aj + diag[i] + 1;
1059     nz    = ai[i+1] - diag[i] - 1;
1060     while (nz--) {
1061       oidx = 7*(*vi++);
1062       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v  += 49;
1070     }
1071     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072     t[5+idx] = s6;t[6+idx] = s7;
1073     idx += 7;
1074   }
1075   /* backward solve the L^T */
1076   for (i=n-1; i>=0; i--){
1077     v    = aa + 49*diag[i] - 49;
1078     vi   = aj + diag[i] - 1;
1079     nz   = diag[i] - ai[i];
1080     idt  = 7*i;
1081     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082     s6 = t[5+idt];s7 = t[6+idt];
1083     while (nz--) {
1084       idx   = 7*(*vi--);
1085       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092       v -= 49;
1093     }
1094   }
1095 
1096   /* copy t into x according to permutation */
1097   ii = 0;
1098   for (i=0; i<n; i++) {
1099     ir      = 7*r[i];
1100     x[ir]   = t[ii];
1101     x[ir+1] = t[ii+1];
1102     x[ir+2] = t[ii+2];
1103     x[ir+3] = t[ii+3];
1104     x[ir+4] = t[ii+4];
1105     x[ir+5] = t[ii+5];
1106     x[ir+6] = t[ii+6];
1107     ii += 7;
1108   }
1109 
1110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1112   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 /* ----------------------------------------------------------- */
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1124   IS             iscol=a->col,isrow=a->row;
1125   PetscErrorCode ierr;
1126   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1127   PetscInt       i,n=a->mbs;
1128   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1129   MatScalar      *aa=a->a,*v;
1130   PetscScalar    *x,*b,*s,*t,*ls;
1131 
1132   PetscFunctionBegin;
1133   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135   t  = a->solve_work;
1136 
1137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1139 
1140   /* forward solve the lower triangular */
1141   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1142   for (i=1; i<n; i++) {
1143     v   = aa + bs2*ai[i];
1144     vi  = aj + ai[i];
1145     nz  = a->diag[i] - ai[i];
1146     s = t + bs*i;
1147     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     while (nz--) {
1149       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1150       v += bs2;
1151     }
1152   }
1153   /* backward solve the upper triangular */
1154   ls = a->solve_work + A->cmap->n;
1155   for (i=n-1; i>=0; i--){
1156     v   = aa + bs2*(a->diag[i] + 1);
1157     vi  = aj + a->diag[i] + 1;
1158     nz  = ai[i+1] - a->diag[i] - 1;
1159     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1160     while (nz--) {
1161       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1162       v += bs2;
1163     }
1164     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1165     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1166   }
1167 
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNCT__
1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1179 {
1180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1181   IS             iscol=a->col,isrow=a->row;
1182   PetscErrorCode ierr;
1183   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1184   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1185   MatScalar      *aa=a->a,*v;
1186   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1187   PetscScalar    *x,*b,*t;
1188 
1189   PetscFunctionBegin;
1190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192   t  = a->solve_work;
1193 
1194   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1195   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1196 
1197   /* forward solve the lower triangular */
1198   idx    = 7*(*r++);
1199   t[0] = b[idx];   t[1] = b[1+idx];
1200   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201   t[5] = b[5+idx]; t[6] = b[6+idx];
1202 
1203   for (i=1; i<n; i++) {
1204     v     = aa + 49*ai[i];
1205     vi    = aj + ai[i];
1206     nz    = diag[i] - ai[i];
1207     idx   = 7*(*r++);
1208     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1210     while (nz--) {
1211       idx   = 7*(*vi++);
1212       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213       x4    = t[3+idx];x5 = t[4+idx];
1214       x6    = t[5+idx];x7 = t[6+idx];
1215       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1222       v += 49;
1223     }
1224     idx = 7*i;
1225     t[idx]   = s1;t[1+idx] = s2;
1226     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227     t[5+idx] = s6;t[6+idx] = s7;
1228   }
1229   /* backward solve the upper triangular */
1230   for (i=n-1; i>=0; i--){
1231     v    = aa + 49*diag[i] + 49;
1232     vi   = aj + diag[i] + 1;
1233     nz   = ai[i+1] - diag[i] - 1;
1234     idt  = 7*i;
1235     s1 = t[idt];  s2 = t[1+idt];
1236     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237     s6 = t[5+idt];s7 = t[6+idt];
1238     while (nz--) {
1239       idx   = 7*(*vi++);
1240       x1    = t[idx];   x2 = t[1+idx];
1241       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242       x6    = t[5+idx]; x7 = t[6+idx];
1243       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1250       v += 49;
1251     }
1252     idc = 7*(*c--);
1253     v   = aa + 49*diag[i];
1254     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1268   }
1269 
1270   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1271   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1272   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1273   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1280 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1281 {
1282   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1283   IS             iscol=a->col,isrow=a->row;
1284   PetscErrorCode ierr;
1285   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
1286   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
1287   MatScalar      *aa=a->a,*v;
1288   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1289   PetscScalar    *x,*b,*t;
1290 
1291   PetscFunctionBegin;
1292   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1293   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1294   t  = a->solve_work;
1295 
1296   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1297   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1298 
1299   /* forward solve the lower triangular */
1300   idx    = 7*r[0];
1301   t[0] = b[idx];   t[1] = b[1+idx];
1302   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1303   t[5] = b[5+idx]; t[6] = b[6+idx];
1304 
1305   for (i=1; i<n; i++) {
1306     v     = aa + 49*ai[i];
1307     vi    = aj + ai[i];
1308     nz    = ai[i+1] - ai[i];
1309     idx   = 7*r[i];
1310     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1311     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1312     for(m=0;m<nz;m++){
1313       idx   = 7*vi[m];
1314       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1315       x4    = t[3+idx];x5 = t[4+idx];
1316       x6    = t[5+idx];x7 = t[6+idx];
1317       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1318       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1319       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1320       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1321       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1322       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1323       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1324       v += 49;
1325     }
1326     idx = 7*i;
1327     t[idx]   = s1;t[1+idx] = s2;
1328     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1329     t[5+idx] = s6;t[6+idx] = s7;
1330   }
1331   /* backward solve the upper triangular */
1332   for (i=n-1; i>=0; i--){
1333     k    = 2*n-i;
1334     v    = aa + 49*ai[k];
1335     vi   = aj + ai[k];
1336     nz   = ai[k+1] - ai[k] - 1;
1337     idt  = 7*i;
1338     s1 = t[idt];  s2 = t[1+idt];
1339     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1340     s6 = t[5+idt];s7 = t[6+idt];
1341     for(m=0;m<nz;m++){
1342       idx   = 7*vi[m];
1343       x1    = t[idx];   x2 = t[1+idx];
1344       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1345       x6    = t[5+idx]; x7 = t[6+idx];
1346       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1347       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1348       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1349       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1350       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1351       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1352       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1353       v += 49;
1354     }
1355     idc = 7*c[i];
1356     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1357                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1358     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1359                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1360     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1361                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1362     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1363                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1364     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1365                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1366     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1367                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1368     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1369                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1370   }
1371 
1372   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1373   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1374   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1375   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1376   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 #undef __FUNCT__
1381 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1382 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1383 {
1384   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1385   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1386   PetscErrorCode    ierr;
1387   PetscInt          *diag = a->diag,jdx;
1388   const MatScalar   *aa=a->a,*v;
1389   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1390   const PetscScalar *b;
1391 
1392   PetscFunctionBegin;
1393   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1394   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1395   /* forward solve the lower triangular */
1396   idx    = 0;
1397   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1398   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1399   x[6] = b[6+idx];
1400   for (i=1; i<n; i++) {
1401     v     =  aa + 49*ai[i];
1402     vi    =  aj + ai[i];
1403     nz    =  diag[i] - ai[i];
1404     idx   =  7*i;
1405     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1406     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1407     s7  =  b[6+idx];
1408     while (nz--) {
1409       jdx   = 7*(*vi++);
1410       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1411       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1412       x7    = x[6+jdx];
1413       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1414       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1415       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1416       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1417       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1418       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1419       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1420       v += 49;
1421      }
1422     x[idx]   = s1;
1423     x[1+idx] = s2;
1424     x[2+idx] = s3;
1425     x[3+idx] = s4;
1426     x[4+idx] = s5;
1427     x[5+idx] = s6;
1428     x[6+idx] = s7;
1429   }
1430   /* backward solve the upper triangular */
1431   for (i=n-1; i>=0; i--){
1432     v    = aa + 49*diag[i] + 49;
1433     vi   = aj + diag[i] + 1;
1434     nz   = ai[i+1] - diag[i] - 1;
1435     idt  = 7*i;
1436     s1 = x[idt];   s2 = x[1+idt];
1437     s3 = x[2+idt]; s4 = x[3+idt];
1438     s5 = x[4+idt]; s6 = x[5+idt];
1439     s7 = x[6+idt];
1440     while (nz--) {
1441       idx   = 7*(*vi++);
1442       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1443       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1444       x7    = x[6+idx];
1445       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1446       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1447       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1448       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1449       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1450       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1451       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1452       v += 49;
1453     }
1454     v        = aa + 49*diag[i];
1455     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1456                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1457     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1458                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1459     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1460                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1461     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1462                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1463     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1464                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1465     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1466                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1467     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1468                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1469   }
1470 
1471   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1472   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1473   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1474   PetscFunctionReturn(0);
1475 }
1476 
1477 #undef __FUNCT__
1478 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1479 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1480 {
1481     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1482     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1483     PetscErrorCode    ierr;
1484     PetscInt          idx,jdx,idt;
1485     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1486     const MatScalar   *aa=a->a,*v;
1487     PetscScalar       *x;
1488     const PetscScalar *b;
1489     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1490 
1491     PetscFunctionBegin;
1492     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1493     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1494     /* forward solve the lower triangular */
1495     idx    = 0;
1496     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1497     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1498     for (i=1; i<n; i++) {
1499        v    = aa + bs2*ai[i];
1500        vi   = aj + ai[i];
1501        nz   = ai[i+1] - ai[i];
1502       idx   = bs*i;
1503        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1504        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1505        for(k=0;k<nz;k++) {
1506           jdx   = bs*vi[k];
1507           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1508 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1509           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1510           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1511           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1512 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1513           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1514 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1515 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1516           v   +=  bs2;
1517         }
1518 
1519        x[idx]   = s1;
1520        x[1+idx] = s2;
1521        x[2+idx] = s3;
1522        x[3+idx] = s4;
1523        x[4+idx] = s5;
1524        x[5+idx] = s6;
1525        x[6+idx] = s7;
1526     }
1527 
1528    /* backward solve the upper triangular */
1529   for (i=n-1; i>=0; i--){
1530      v   = aa + bs2*ai[2*n-i];
1531      vi  = aj + ai[2*n-i];
1532      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1533      idt = bs*i;
1534      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1535      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1536     for(k=0;k<nz;k++) {
1537       idx   = bs*vi[k];
1538        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1539        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1540        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1541        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1542        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1543        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1544        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1545        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1546        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1547         v   +=  bs2;
1548     }
1549     /* x = inv_diagonal*x */
1550     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1551     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1552     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1553     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1554     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1555     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1556     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1557   }
1558 
1559   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1560   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1561   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 #undef __FUNCT__
1566 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1567 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1568 {
1569   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1570   IS                iscol=a->col,isrow=a->row;
1571   PetscErrorCode    ierr;
1572   const PetscInt    *r,*c,*rout,*cout;
1573   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1574   const MatScalar   *aa=a->a,*v;
1575   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1576   const PetscScalar *b;
1577   PetscFunctionBegin;
1578   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1579   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1580   t  = a->solve_work;
1581 
1582   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1583   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1584 
1585   /* forward solve the lower triangular */
1586   idx    = 6*(*r++);
1587   t[0] = b[idx];   t[1] = b[1+idx];
1588   t[2] = b[2+idx]; t[3] = b[3+idx];
1589   t[4] = b[4+idx]; t[5] = b[5+idx];
1590   for (i=1; i<n; i++) {
1591     v     = aa + 36*ai[i];
1592     vi    = aj + ai[i];
1593     nz    = diag[i] - ai[i];
1594     idx   = 6*(*r++);
1595     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1596     s5  = b[4+idx]; s6 = b[5+idx];
1597     while (nz--) {
1598       idx   = 6*(*vi++);
1599       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1600       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1601       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1602       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1603       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1604       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1605       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1606       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1607       v += 36;
1608     }
1609     idx = 6*i;
1610     t[idx]   = s1;t[1+idx] = s2;
1611     t[2+idx] = s3;t[3+idx] = s4;
1612     t[4+idx] = s5;t[5+idx] = s6;
1613   }
1614   /* backward solve the upper triangular */
1615   for (i=n-1; i>=0; i--){
1616     v    = aa + 36*diag[i] + 36;
1617     vi   = aj + diag[i] + 1;
1618     nz   = ai[i+1] - diag[i] - 1;
1619     idt  = 6*i;
1620     s1 = t[idt];  s2 = t[1+idt];
1621     s3 = t[2+idt];s4 = t[3+idt];
1622     s5 = t[4+idt];s6 = t[5+idt];
1623     while (nz--) {
1624       idx   = 6*(*vi++);
1625       x1    = t[idx];   x2 = t[1+idx];
1626       x3    = t[2+idx]; x4 = t[3+idx];
1627       x5    = t[4+idx]; x6 = t[5+idx];
1628       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1629       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1630       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1631       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1632       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1633       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1634       v += 36;
1635     }
1636     idc = 6*(*c--);
1637     v   = aa + 36*diag[i];
1638     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1639                                  v[18]*s4+v[24]*s5+v[30]*s6;
1640     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1641                                  v[19]*s4+v[25]*s5+v[31]*s6;
1642     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1643                                  v[20]*s4+v[26]*s5+v[32]*s6;
1644     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1645                                  v[21]*s4+v[27]*s5+v[33]*s6;
1646     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1647                                  v[22]*s4+v[28]*s5+v[34]*s6;
1648     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1649                                  v[23]*s4+v[29]*s5+v[35]*s6;
1650   }
1651 
1652   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1653   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1654   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1655   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1656   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1657   PetscFunctionReturn(0);
1658 }
1659 
1660 #undef __FUNCT__
1661 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
1662 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
1663 {
1664   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1665   IS                iscol=a->col,isrow=a->row;
1666   PetscErrorCode    ierr;
1667   const PetscInt    *r,*c,*rout,*cout;
1668   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
1669   const MatScalar   *aa=a->a,*v;
1670   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1671   const PetscScalar *b;
1672   PetscFunctionBegin;
1673   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1674   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1675   t  = a->solve_work;
1676 
1677   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1678   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1679 
1680   /* forward solve the lower triangular */
1681   idx    = 6*r[0];
1682   t[0] = b[idx];   t[1] = b[1+idx];
1683   t[2] = b[2+idx]; t[3] = b[3+idx];
1684   t[4] = b[4+idx]; t[5] = b[5+idx];
1685   for (i=1; i<n; i++) {
1686     v     = aa + 36*ai[i];
1687     vi    = aj + ai[i];
1688     nz    = ai[i+1] - ai[i];
1689     idx   = 6*r[i];
1690     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1691     s5  = b[4+idx]; s6 = b[5+idx];
1692     for(m=0;m<nz;m++){
1693       idx   = 6*vi[m];
1694       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1695       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1696       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1697       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1698       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1699       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1700       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1701       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1702       v += 36;
1703     }
1704     idx = 6*i;
1705     t[idx]   = s1;t[1+idx] = s2;
1706     t[2+idx] = s3;t[3+idx] = s4;
1707     t[4+idx] = s5;t[5+idx] = s6;
1708   }
1709   /* backward solve the upper triangular */
1710   for (i=n-1; i>=0; i--){
1711     k    = 2*n-i;
1712     v    = aa + 36*ai[k];
1713     vi   = aj + ai[k];
1714     nz   = ai[k+1] - ai[k] - 1;
1715     idt  = 6*i;
1716     s1 = t[idt];  s2 = t[1+idt];
1717     s3 = t[2+idt];s4 = t[3+idt];
1718     s5 = t[4+idt];s6 = t[5+idt];
1719     for(m=0;m<nz;m++){
1720       idx   = 6*vi[m];
1721       x1    = t[idx];   x2 = t[1+idx];
1722       x3    = t[2+idx]; x4 = t[3+idx];
1723       x5    = t[4+idx]; x6 = t[5+idx];
1724       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1725       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1726       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1727       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1728       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1729       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1730       v += 36;
1731     }
1732     idc = 6*c[i];
1733     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1734                                  v[18]*s4+v[24]*s5+v[30]*s6;
1735     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1736                                  v[19]*s4+v[25]*s5+v[31]*s6;
1737     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1738                                  v[20]*s4+v[26]*s5+v[32]*s6;
1739     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1740                                  v[21]*s4+v[27]*s5+v[33]*s6;
1741     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1742                                  v[22]*s4+v[28]*s5+v[34]*s6;
1743     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1744                                  v[23]*s4+v[29]*s5+v[35]*s6;
1745   }
1746 
1747   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1748   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1749   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1750   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1751   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1752   PetscFunctionReturn(0);
1753 }
1754 
1755 
1756 #undef __FUNCT__
1757 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1758 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1759 {
1760   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1761   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1762   PetscErrorCode    ierr;
1763   PetscInt          *diag = a->diag,jdx;
1764   const MatScalar   *aa=a->a,*v;
1765   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1766   const PetscScalar *b;
1767 
1768   PetscFunctionBegin;
1769   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1770   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1771   /* forward solve the lower triangular */
1772   idx    = 0;
1773   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1774   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1775   for (i=1; i<n; i++) {
1776     v     =  aa + 36*ai[i];
1777     vi    =  aj + ai[i];
1778     nz    =  diag[i] - ai[i];
1779     idx   =  6*i;
1780     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1781     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1782     while (nz--) {
1783       jdx   = 6*(*vi++);
1784       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1785       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1786       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1787       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1788       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1789       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1790       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1791       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1792       v += 36;
1793      }
1794     x[idx]   = s1;
1795     x[1+idx] = s2;
1796     x[2+idx] = s3;
1797     x[3+idx] = s4;
1798     x[4+idx] = s5;
1799     x[5+idx] = s6;
1800   }
1801   /* backward solve the upper triangular */
1802   for (i=n-1; i>=0; i--){
1803     v    = aa + 36*diag[i] + 36;
1804     vi   = aj + diag[i] + 1;
1805     nz   = ai[i+1] - diag[i] - 1;
1806     idt  = 6*i;
1807     s1 = x[idt];   s2 = x[1+idt];
1808     s3 = x[2+idt]; s4 = x[3+idt];
1809     s5 = x[4+idt]; s6 = x[5+idt];
1810     while (nz--) {
1811       idx   = 6*(*vi++);
1812       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1813       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1814       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1815       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1816       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1817       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1818       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1819       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1820       v += 36;
1821     }
1822     v        = aa + 36*diag[i];
1823     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1824     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1825     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1826     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1827     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1828     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1829   }
1830 
1831   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1832   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1833   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1834   PetscFunctionReturn(0);
1835 }
1836 
1837 #undef __FUNCT__
1838 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1839 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1840 {
1841     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1842     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1843     PetscErrorCode    ierr;
1844     PetscInt          idx,jdx,idt;
1845     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1846     const MatScalar   *aa=a->a,*v;
1847     PetscScalar       *x;
1848     const PetscScalar *b;
1849     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1850 
1851     PetscFunctionBegin;
1852     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1853     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1854     /* forward solve the lower triangular */
1855     idx    = 0;
1856     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1857     x[4] = b[4+idx];x[5] = b[5+idx];
1858     for (i=1; i<n; i++) {
1859        v    = aa + bs2*ai[i];
1860        vi   = aj + ai[i];
1861        nz   = ai[i+1] - ai[i];
1862       idx   = bs*i;
1863        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1864        s5   = b[4+idx];s6 = b[5+idx];
1865        for(k=0;k<nz;k++){
1866           jdx   = bs*vi[k];
1867           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1868 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1869           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1870           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1871           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1872 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1873           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1874 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1875           v   +=  bs2;
1876         }
1877 
1878        x[idx]   = s1;
1879        x[1+idx] = s2;
1880        x[2+idx] = s3;
1881        x[3+idx] = s4;
1882        x[4+idx] = s5;
1883        x[5+idx] = s6;
1884     }
1885 
1886    /* backward solve the upper triangular */
1887   for (i=n-1; i>=0; i--){
1888      v   = aa + bs2*ai[2*n-i];
1889      vi  = aj + ai[2*n-i];
1890      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1891      idt = bs*i;
1892      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1893      s5 = x[4+idt];s6 = x[5+idt];
1894      for(k=0;k<nz;k++){
1895       idx   = bs*vi[k];
1896        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1897        x5    = x[4+idx];x6 = x[5+idx];
1898        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1899        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1900        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1901        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1902        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1903        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1904         v   +=  bs2;
1905     }
1906     /* x = inv_diagonal*x */
1907    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1908    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1909    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1910    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1911    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1912    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1913   }
1914 
1915   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1916   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1917   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1918   PetscFunctionReturn(0);
1919 }
1920 
1921 #undef __FUNCT__
1922 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1923 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1924 {
1925   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1926   IS                iscol=a->col,isrow=a->row;
1927   PetscErrorCode    ierr;
1928   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1929   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1930   const MatScalar   *aa=a->a,*v;
1931   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1932   const PetscScalar *b;
1933 
1934   PetscFunctionBegin;
1935   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1936   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1937   t  = a->solve_work;
1938 
1939   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1940   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1941 
1942   /* forward solve the lower triangular */
1943   idx    = 5*(*r++);
1944   t[0] = b[idx];   t[1] = b[1+idx];
1945   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1946   for (i=1; i<n; i++) {
1947     v     = aa + 25*ai[i];
1948     vi    = aj + ai[i];
1949     nz    = diag[i] - ai[i];
1950     idx   = 5*(*r++);
1951     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1952     s5  = b[4+idx];
1953     while (nz--) {
1954       idx   = 5*(*vi++);
1955       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1956       x4    = t[3+idx];x5 = t[4+idx];
1957       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1958       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1959       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1960       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1961       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1962       v += 25;
1963     }
1964     idx = 5*i;
1965     t[idx]   = s1;t[1+idx] = s2;
1966     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1967   }
1968   /* backward solve the upper triangular */
1969   for (i=n-1; i>=0; i--){
1970     v    = aa + 25*diag[i] + 25;
1971     vi   = aj + diag[i] + 1;
1972     nz   = ai[i+1] - diag[i] - 1;
1973     idt  = 5*i;
1974     s1 = t[idt];  s2 = t[1+idt];
1975     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1976     while (nz--) {
1977       idx   = 5*(*vi++);
1978       x1    = t[idx];   x2 = t[1+idx];
1979       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1980       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1981       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1982       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1983       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1984       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1985       v += 25;
1986     }
1987     idc = 5*(*c--);
1988     v   = aa + 25*diag[i];
1989     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1990                                  v[15]*s4+v[20]*s5;
1991     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1992                                  v[16]*s4+v[21]*s5;
1993     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1994                                  v[17]*s4+v[22]*s5;
1995     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1996                                  v[18]*s4+v[23]*s5;
1997     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1998                                  v[19]*s4+v[24]*s5;
1999   }
2000 
2001   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2002   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2003   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2004   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2005   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2006   PetscFunctionReturn(0);
2007 }
2008 
2009 #undef __FUNCT__
2010 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2011 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
2012 {
2013   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2014   IS                iscol=a->col,isrow=a->row;
2015   PetscErrorCode    ierr;
2016   const PetscInt    *r,*c,*rout,*cout;
2017   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2018   const MatScalar   *aa=a->a,*v;
2019   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2020   const PetscScalar *b;
2021 
2022   PetscFunctionBegin;
2023   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2024   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2025   t  = a->solve_work;
2026 
2027   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2028   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2029 
2030   /* forward solve the lower triangular */
2031   idx    = 5*r[0];
2032   t[0] = b[idx];   t[1] = b[1+idx];
2033   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2034   for (i=1; i<n; i++) {
2035     v     = aa + 25*ai[i];
2036     vi    = aj + ai[i];
2037     nz    = ai[i+1] - ai[i];
2038     idx   = 5*r[i];
2039     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2040     s5  = b[4+idx];
2041     for(m=0;m<nz;m++){
2042       idx   = 5*vi[m];
2043       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2044       x4    = t[3+idx];x5 = t[4+idx];
2045       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2046       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2047       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2048       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2049       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2050       v += 25;
2051     }
2052     idx = 5*i;
2053     t[idx]   = s1;t[1+idx] = s2;
2054     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2055   }
2056   /* backward solve the upper triangular */
2057   for (i=n-1; i>=0; i--){
2058     k    = 2*n-i;
2059     v    = aa + 25*ai[k];
2060     vi   = aj + ai[k];
2061     nz   = ai[k+1] - ai[k] - 1;
2062     idt  = 5*i;
2063     s1 = t[idt];  s2 = t[1+idt];
2064     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2065     for(m=0;m<nz;m++){
2066       idx   = 5*vi[m];
2067       x1    = t[idx];   x2 = t[1+idx];
2068       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2069       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2070       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2071       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2072       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2073       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2074       v += 25;
2075     }
2076     idc = 5*c[i];
2077     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2078                                  v[15]*s4+v[20]*s5;
2079     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2080                                  v[16]*s4+v[21]*s5;
2081     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2082                                  v[17]*s4+v[22]*s5;
2083     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2084                                  v[18]*s4+v[23]*s5;
2085     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2086                                  v[19]*s4+v[24]*s5;
2087   }
2088 
2089   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2090   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2091   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2092   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2093   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2094   PetscFunctionReturn(0);
2095 }
2096 #undef __FUNCT__
2097 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2098 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
2099 {
2100   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2101   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2102   PetscErrorCode    ierr;
2103   PetscInt          *diag = a->diag,jdx;
2104   const MatScalar   *aa=a->a,*v;
2105   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2106   const PetscScalar *b;
2107 
2108   PetscFunctionBegin;
2109   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2110   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2111   /* forward solve the lower triangular */
2112   idx    = 0;
2113   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2114   for (i=1; i<n; i++) {
2115     v     =  aa + 25*ai[i];
2116     vi    =  aj + ai[i];
2117     nz    =  diag[i] - ai[i];
2118     idx   =  5*i;
2119     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2120     while (nz--) {
2121       jdx   = 5*(*vi++);
2122       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2123       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2124       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2125       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2126       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2127       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2128       v    += 25;
2129     }
2130     x[idx]   = s1;
2131     x[1+idx] = s2;
2132     x[2+idx] = s3;
2133     x[3+idx] = s4;
2134     x[4+idx] = s5;
2135   }
2136   /* backward solve the upper triangular */
2137   for (i=n-1; i>=0; i--){
2138     v    = aa + 25*diag[i] + 25;
2139     vi   = aj + diag[i] + 1;
2140     nz   = ai[i+1] - diag[i] - 1;
2141     idt  = 5*i;
2142     s1 = x[idt];  s2 = x[1+idt];
2143     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2144     while (nz--) {
2145       idx   = 5*(*vi++);
2146       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2147       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2148       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2149       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2150       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2151       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2152       v    += 25;
2153     }
2154     v        = aa + 25*diag[i];
2155     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2156     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2157     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2158     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2159     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2160   }
2161 
2162   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2163   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2164   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2165   PetscFunctionReturn(0);
2166 }
2167 
2168 #undef __FUNCT__
2169 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2170 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2171 {
2172   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2173   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2174   PetscErrorCode    ierr;
2175   PetscInt          jdx;
2176   const MatScalar   *aa=a->a,*v;
2177   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2178   const PetscScalar *b;
2179 
2180   PetscFunctionBegin;
2181   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2182   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2183   /* forward solve the lower triangular */
2184   idx    = 0;
2185   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2186   for (i=1; i<n; i++) {
2187     v   = aa + 25*ai[i];
2188     vi  = aj + ai[i];
2189     nz  = ai[i+1] - ai[i];
2190     idx = 5*i;
2191     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2192     for(k=0;k<nz;k++) {
2193       jdx   = 5*vi[k];
2194       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2195       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2196       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2197       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2198       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2199       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2200       v    += 25;
2201     }
2202     x[idx]   = s1;
2203     x[1+idx] = s2;
2204     x[2+idx] = s3;
2205     x[3+idx] = s4;
2206     x[4+idx] = s5;
2207   }
2208 
2209   /* backward solve the upper triangular */
2210   for (i=n-1; i>=0; i--){
2211     v   = aa + 25*ai[2*n-i];
2212     vi  = aj + ai[2*n-i];
2213     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2214     idt = 5*i;
2215     s1 = x[idt];  s2 = x[1+idt];
2216     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2217     for(k=0;k<nz;k++){
2218       idx   = 5*vi[k];
2219       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2220       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2221       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2222       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2223       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2224       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2225       v    += 25;
2226     }
2227     /* x = inv_diagonal*x */
2228     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2229     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2230     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2231     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2232     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2233   }
2234 
2235   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2236   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2237   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2238   PetscFunctionReturn(0);
2239 }
2240 
2241 #undef __FUNCT__
2242 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2243 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2244 {
2245   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2246   IS                iscol=a->col,isrow=a->row;
2247   PetscErrorCode    ierr;
2248   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2249   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2250   const MatScalar   *aa=a->a,*v;
2251   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2252   const PetscScalar *b;
2253 
2254   PetscFunctionBegin;
2255   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2256   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2257   t  = a->solve_work;
2258 
2259   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2260   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2261 
2262   /* forward solve the lower triangular */
2263   idx    = 4*(*r++);
2264   t[0] = b[idx];   t[1] = b[1+idx];
2265   t[2] = b[2+idx]; t[3] = b[3+idx];
2266   for (i=1; i<n; i++) {
2267     v     = aa + 16*ai[i];
2268     vi    = aj + ai[i];
2269     nz    = diag[i] - ai[i];
2270     idx   = 4*(*r++);
2271     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2272     while (nz--) {
2273       idx   = 4*(*vi++);
2274       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2275       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2276       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2277       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2278       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2279       v    += 16;
2280     }
2281     idx        = 4*i;
2282     t[idx]   = s1;t[1+idx] = s2;
2283     t[2+idx] = s3;t[3+idx] = s4;
2284   }
2285   /* backward solve the upper triangular */
2286   for (i=n-1; i>=0; i--){
2287     v    = aa + 16*diag[i] + 16;
2288     vi   = aj + diag[i] + 1;
2289     nz   = ai[i+1] - diag[i] - 1;
2290     idt  = 4*i;
2291     s1 = t[idt];  s2 = t[1+idt];
2292     s3 = t[2+idt];s4 = t[3+idt];
2293     while (nz--) {
2294       idx   = 4*(*vi++);
2295       x1    = t[idx];   x2 = t[1+idx];
2296       x3    = t[2+idx]; x4 = t[3+idx];
2297       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2298       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2299       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2300       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2301       v += 16;
2302     }
2303     idc      = 4*(*c--);
2304     v        = aa + 16*diag[i];
2305     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2306     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2307     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2308     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2309   }
2310 
2311   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2312   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2313   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2314   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2315   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2316   PetscFunctionReturn(0);
2317 }
2318 
2319 #undef __FUNCT__
2320 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2321 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
2322 {
2323   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2324   IS                iscol=a->col,isrow=a->row;
2325   PetscErrorCode    ierr;
2326   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2327   const PetscInt    *r,*c,*rout,*cout;
2328   const MatScalar   *aa=a->a,*v;
2329   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2330   const PetscScalar *b;
2331 
2332   PetscFunctionBegin;
2333   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2334   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2335   t  = a->solve_work;
2336 
2337   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2338   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2339 
2340   /* forward solve the lower triangular */
2341   idx    = 4*r[0];
2342   t[0] = b[idx];   t[1] = b[1+idx];
2343   t[2] = b[2+idx]; t[3] = b[3+idx];
2344   for (i=1; i<n; i++) {
2345     v     = aa + 16*ai[i];
2346     vi    = aj + ai[i];
2347     nz    = ai[i+1] - ai[i];
2348     idx   = 4*r[i];
2349     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2350     for(m=0;m<nz;m++){
2351       idx   = 4*vi[m];
2352       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2353       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2354       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2355       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2356       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2357       v    += 16;
2358     }
2359     idx        = 4*i;
2360     t[idx]   = s1;t[1+idx] = s2;
2361     t[2+idx] = s3;t[3+idx] = s4;
2362   }
2363   /* backward solve the upper triangular */
2364   for (i=n-1; i>=0; i--){
2365     k    = 2*n-i;
2366     v    = aa + 16*ai[k];
2367     vi   = aj + ai[k];
2368     nz   = ai[k+1] - ai[k] - 1;
2369     idt  = 4*i;
2370     s1 = t[idt];  s2 = t[1+idt];
2371     s3 = t[2+idt];s4 = t[3+idt];
2372     for(m=0;m<nz;m++){
2373       idx   = 4*vi[m];
2374       x1    = t[idx];   x2 = t[1+idx];
2375       x3    = t[2+idx]; x4 = t[3+idx];
2376       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2377       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2378       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2379       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2380       v += 16;
2381     }
2382     idc      = 4*c[i];
2383     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2384     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2385     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2386     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2387   }
2388 
2389   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2390   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2391   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2392   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2393   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2394   PetscFunctionReturn(0);
2395 }
2396 
2397 #undef __FUNCT__
2398 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2399 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2400 {
2401   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2402   IS                iscol=a->col,isrow=a->row;
2403   PetscErrorCode    ierr;
2404   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2405   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2406   const MatScalar   *aa=a->a,*v;
2407   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2408   PetscScalar       *x;
2409   const PetscScalar *b;
2410 
2411   PetscFunctionBegin;
2412   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2413   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2414   t  = (MatScalar *)a->solve_work;
2415 
2416   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2417   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2418 
2419   /* forward solve the lower triangular */
2420   idx    = 4*(*r++);
2421   t[0] = (MatScalar)b[idx];
2422   t[1] = (MatScalar)b[1+idx];
2423   t[2] = (MatScalar)b[2+idx];
2424   t[3] = (MatScalar)b[3+idx];
2425   for (i=1; i<n; i++) {
2426     v     = aa + 16*ai[i];
2427     vi    = aj + ai[i];
2428     nz    = diag[i] - ai[i];
2429     idx   = 4*(*r++);
2430     s1 = (MatScalar)b[idx];
2431     s2 = (MatScalar)b[1+idx];
2432     s3 = (MatScalar)b[2+idx];
2433     s4 = (MatScalar)b[3+idx];
2434     while (nz--) {
2435       idx   = 4*(*vi++);
2436       x1  = t[idx];
2437       x2  = t[1+idx];
2438       x3  = t[2+idx];
2439       x4  = t[3+idx];
2440       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2441       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2442       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2443       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2444       v    += 16;
2445     }
2446     idx        = 4*i;
2447     t[idx]   = s1;
2448     t[1+idx] = s2;
2449     t[2+idx] = s3;
2450     t[3+idx] = s4;
2451   }
2452   /* backward solve the upper triangular */
2453   for (i=n-1; i>=0; i--){
2454     v    = aa + 16*diag[i] + 16;
2455     vi   = aj + diag[i] + 1;
2456     nz   = ai[i+1] - diag[i] - 1;
2457     idt  = 4*i;
2458     s1 = t[idt];
2459     s2 = t[1+idt];
2460     s3 = t[2+idt];
2461     s4 = t[3+idt];
2462     while (nz--) {
2463       idx   = 4*(*vi++);
2464       x1  = t[idx];
2465       x2  = t[1+idx];
2466       x3  = t[2+idx];
2467       x4  = t[3+idx];
2468       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2469       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2470       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2471       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2472       v += 16;
2473     }
2474     idc      = 4*(*c--);
2475     v        = aa + 16*diag[i];
2476     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2477     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2478     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2479     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2480     x[idc]   = (PetscScalar)t[idt];
2481     x[1+idc] = (PetscScalar)t[1+idt];
2482     x[2+idc] = (PetscScalar)t[2+idt];
2483     x[3+idc] = (PetscScalar)t[3+idt];
2484  }
2485 
2486   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2487   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2488   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2489   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2490   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2491   PetscFunctionReturn(0);
2492 }
2493 
2494 #if defined (PETSC_HAVE_SSE)
2495 
2496 #include PETSC_HAVE_SSE
2497 
2498 #undef __FUNCT__
2499 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2500 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
2501 {
2502   /*
2503      Note: This code uses demotion of double
2504      to float when performing the mixed-mode computation.
2505      This may not be numerically reasonable for all applications.
2506   */
2507   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2508   IS             iscol=a->col,isrow=a->row;
2509   PetscErrorCode ierr;
2510   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
2511   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2512   MatScalar      *aa=a->a,*v;
2513   PetscScalar    *x,*b,*t;
2514 
2515   /* Make space in temp stack for 16 Byte Aligned arrays */
2516   float           ssealignedspace[11],*tmps,*tmpx;
2517   unsigned long   offset;
2518 
2519   PetscFunctionBegin;
2520   SSE_SCOPE_BEGIN;
2521 
2522     offset = (unsigned long)ssealignedspace % 16;
2523     if (offset) offset = (16 - offset)/4;
2524     tmps = &ssealignedspace[offset];
2525     tmpx = &ssealignedspace[offset+4];
2526     PREFETCH_NTA(aa+16*ai[1]);
2527 
2528     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2529     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2530     t  = a->solve_work;
2531 
2532     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2533     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2534 
2535     /* forward solve the lower triangular */
2536     idx  = 4*(*r++);
2537     t[0] = b[idx];   t[1] = b[1+idx];
2538     t[2] = b[2+idx]; t[3] = b[3+idx];
2539     v    =  aa + 16*ai[1];
2540 
2541     for (i=1; i<n;) {
2542       PREFETCH_NTA(&v[8]);
2543       vi   =  aj      + ai[i];
2544       nz   =  diag[i] - ai[i];
2545       idx  =  4*(*r++);
2546 
2547       /* Demote sum from double to float */
2548       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2549       LOAD_PS(tmps,XMM7);
2550 
2551       while (nz--) {
2552         PREFETCH_NTA(&v[16]);
2553         idx = 4*(*vi++);
2554 
2555         /* Demote solution (so far) from double to float */
2556         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2557 
2558         /* 4x4 Matrix-Vector product with negative accumulation: */
2559         SSE_INLINE_BEGIN_2(tmpx,v)
2560           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2561 
2562           /* First Column */
2563           SSE_COPY_PS(XMM0,XMM6)
2564           SSE_SHUFFLE(XMM0,XMM0,0x00)
2565           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2566           SSE_SUB_PS(XMM7,XMM0)
2567 
2568           /* Second Column */
2569           SSE_COPY_PS(XMM1,XMM6)
2570           SSE_SHUFFLE(XMM1,XMM1,0x55)
2571           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2572           SSE_SUB_PS(XMM7,XMM1)
2573 
2574           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2575 
2576           /* Third Column */
2577           SSE_COPY_PS(XMM2,XMM6)
2578           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2579           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2580           SSE_SUB_PS(XMM7,XMM2)
2581 
2582           /* Fourth Column */
2583           SSE_COPY_PS(XMM3,XMM6)
2584           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2585           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2586           SSE_SUB_PS(XMM7,XMM3)
2587         SSE_INLINE_END_2
2588 
2589         v  += 16;
2590       }
2591       idx = 4*i;
2592       v   = aa + 16*ai[++i];
2593       PREFETCH_NTA(v);
2594       STORE_PS(tmps,XMM7);
2595 
2596       /* Promote result from float to double */
2597       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
2598     }
2599     /* backward solve the upper triangular */
2600     idt  = 4*(n-1);
2601     ai16 = 16*diag[n-1];
2602     v    = aa + ai16 + 16;
2603     for (i=n-1; i>=0;){
2604       PREFETCH_NTA(&v[8]);
2605       vi = aj + diag[i] + 1;
2606       nz = ai[i+1] - diag[i] - 1;
2607 
2608       /* Demote accumulator from double to float */
2609       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2610       LOAD_PS(tmps,XMM7);
2611 
2612       while (nz--) {
2613         PREFETCH_NTA(&v[16]);
2614         idx = 4*(*vi++);
2615 
2616         /* Demote solution (so far) from double to float */
2617         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2618 
2619         /* 4x4 Matrix-Vector Product with negative accumulation: */
2620         SSE_INLINE_BEGIN_2(tmpx,v)
2621           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2622 
2623           /* First Column */
2624           SSE_COPY_PS(XMM0,XMM6)
2625           SSE_SHUFFLE(XMM0,XMM0,0x00)
2626           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2627           SSE_SUB_PS(XMM7,XMM0)
2628 
2629           /* Second Column */
2630           SSE_COPY_PS(XMM1,XMM6)
2631           SSE_SHUFFLE(XMM1,XMM1,0x55)
2632           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2633           SSE_SUB_PS(XMM7,XMM1)
2634 
2635           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2636 
2637           /* Third Column */
2638           SSE_COPY_PS(XMM2,XMM6)
2639           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2640           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2641           SSE_SUB_PS(XMM7,XMM2)
2642 
2643           /* Fourth Column */
2644           SSE_COPY_PS(XMM3,XMM6)
2645           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2646           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2647           SSE_SUB_PS(XMM7,XMM3)
2648         SSE_INLINE_END_2
2649         v  += 16;
2650       }
2651       v    = aa + ai16;
2652       ai16 = 16*diag[--i];
2653       PREFETCH_NTA(aa+ai16+16);
2654       /*
2655          Scale the result by the diagonal 4x4 block,
2656          which was inverted as part of the factorization
2657       */
2658       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2659         /* First Column */
2660         SSE_COPY_PS(XMM0,XMM7)
2661         SSE_SHUFFLE(XMM0,XMM0,0x00)
2662         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2663 
2664         /* Second Column */
2665         SSE_COPY_PS(XMM1,XMM7)
2666         SSE_SHUFFLE(XMM1,XMM1,0x55)
2667         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2668         SSE_ADD_PS(XMM0,XMM1)
2669 
2670         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2671 
2672         /* Third Column */
2673         SSE_COPY_PS(XMM2,XMM7)
2674         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2675         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2676         SSE_ADD_PS(XMM0,XMM2)
2677 
2678         /* Fourth Column */
2679         SSE_COPY_PS(XMM3,XMM7)
2680         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2681         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2682         SSE_ADD_PS(XMM0,XMM3)
2683 
2684         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2685       SSE_INLINE_END_3
2686 
2687       /* Promote solution from float to double */
2688       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2689 
2690       /* Apply reordering to t and stream into x.    */
2691       /* This way, x doesn't pollute the cache.      */
2692       /* Be careful with size: 2 doubles = 4 floats! */
2693       idc  = 4*(*c--);
2694       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2695         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2696         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2697         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2698         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2699         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2700         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2701       SSE_INLINE_END_2
2702       v    = aa + ai16 + 16;
2703       idt -= 4;
2704     }
2705 
2706     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2707     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2708     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2709     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2710     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2711   SSE_SCOPE_END;
2712   PetscFunctionReturn(0);
2713 }
2714 
2715 #endif
2716 
2717 
2718 /*
2719       Special case where the matrix was ILU(0) factored in the natural
2720    ordering. This eliminates the need for the column and row permutation.
2721 */
2722 #undef __FUNCT__
2723 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2724 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2725 {
2726   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2727   PetscInt          n=a->mbs;
2728   const PetscInt    *ai=a->i,*aj=a->j;
2729   PetscErrorCode    ierr;
2730   const PetscInt    *diag = a->diag;
2731   const MatScalar   *aa=a->a;
2732   PetscScalar       *x;
2733   const PetscScalar *b;
2734 
2735   PetscFunctionBegin;
2736   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2737   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2738 
2739 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2740   {
2741     static PetscScalar w[2000]; /* very BAD need to fix */
2742     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2743   }
2744 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2745   {
2746     static PetscScalar w[2000]; /* very BAD need to fix */
2747     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2748   }
2749 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2750   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2751 #else
2752   {
2753     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2754     const MatScalar *v;
2755     PetscInt        jdx,idt,idx,nz,i,ai16;
2756     const PetscInt  *vi;
2757 
2758   /* forward solve the lower triangular */
2759   idx    = 0;
2760   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2761   for (i=1; i<n; i++) {
2762     v     =  aa      + 16*ai[i];
2763     vi    =  aj      + ai[i];
2764     nz    =  diag[i] - ai[i];
2765     idx   +=  4;
2766     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2767     while (nz--) {
2768       jdx   = 4*(*vi++);
2769       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2770       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2771       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2772       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2773       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2774       v    += 16;
2775     }
2776     x[idx]   = s1;
2777     x[1+idx] = s2;
2778     x[2+idx] = s3;
2779     x[3+idx] = s4;
2780   }
2781   /* backward solve the upper triangular */
2782   idt = 4*(n-1);
2783   for (i=n-1; i>=0; i--){
2784     ai16 = 16*diag[i];
2785     v    = aa + ai16 + 16;
2786     vi   = aj + diag[i] + 1;
2787     nz   = ai[i+1] - diag[i] - 1;
2788     s1 = x[idt];  s2 = x[1+idt];
2789     s3 = x[2+idt];s4 = x[3+idt];
2790     while (nz--) {
2791       idx   = 4*(*vi++);
2792       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2793       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2794       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2795       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2796       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2797       v    += 16;
2798     }
2799     v        = aa + ai16;
2800     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2801     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2802     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2803     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2804     idt -= 4;
2805   }
2806   }
2807 #endif
2808 
2809   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2810   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2811   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2812   PetscFunctionReturn(0);
2813 }
2814 
2815 #undef __FUNCT__
2816 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
2817 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2818 {
2819     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2820     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2821     PetscErrorCode    ierr;
2822     PetscInt          idx,jdx,idt;
2823     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2824     const MatScalar   *aa=a->a,*v;
2825     PetscScalar       *x;
2826     const PetscScalar *b;
2827     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2828 
2829     PetscFunctionBegin;
2830     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2831     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2832     /* forward solve the lower triangular */
2833     idx    = 0;
2834     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2835     for (i=1; i<n; i++) {
2836        v    = aa + bs2*ai[i];
2837        vi   = aj + ai[i];
2838        nz   = ai[i+1] - ai[i];
2839       idx   = bs*i;
2840        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2841       for(k=0;k<nz;k++) {
2842           jdx   = bs*vi[k];
2843           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2844           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2845           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2846           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2847 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2848 
2849           v   +=  bs2;
2850         }
2851 
2852        x[idx]   = s1;
2853        x[1+idx] = s2;
2854        x[2+idx] = s3;
2855        x[3+idx] = s4;
2856     }
2857 
2858    /* backward solve the upper triangular */
2859   for (i=n-1; i>=0; i--){
2860      v   = aa + bs2*ai[2*n-i];
2861      vi  = aj + ai[2*n-i];
2862      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2863      idt = bs*i;
2864      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2865 
2866     for(k=0;k<nz;k++){
2867       idx   = bs*vi[k];
2868        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2869        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2870        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2871        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2872        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2873 
2874         v   +=  bs2;
2875     }
2876     /* x = inv_diagonal*x */
2877    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2878    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
2879    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2880    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2881 
2882   }
2883 
2884   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2885   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2886   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2887   PetscFunctionReturn(0);
2888 }
2889 
2890 #undef __FUNCT__
2891 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
2892 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2893 {
2894     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2895     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2896     PetscErrorCode    ierr;
2897     PetscInt          idx,jdx,idt;
2898     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2899     const MatScalar   *aa=a->a,*v;
2900     PetscScalar       *x;
2901     const PetscScalar *b;
2902     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2903 
2904     PetscFunctionBegin;
2905     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2906     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2907     /* forward solve the lower triangular */
2908     idx    = 0;
2909     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2910     for (i=1; i<n; i++) {
2911        v    = aa + bs2*ai[i];
2912        vi   = aj + ai[i];
2913        nz   = ai[i+1] - ai[i];
2914       idx   = bs*i;
2915        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2916       for(k=0;k<nz;k++) {
2917           jdx   = bs*vi[k];
2918           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2919           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2920           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2921           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2922 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2923 
2924           v   +=  bs2;
2925         }
2926 
2927        x[idx]   = s1;
2928        x[1+idx] = s2;
2929        x[2+idx] = s3;
2930        x[3+idx] = s4;
2931     }
2932 
2933    /* backward solve the upper triangular */
2934   for (i=n-1; i>=0; i--){
2935     v   = aa + bs2*(adiag[i+1]+1);
2936      vi  = aj + adiag[i+1]+1;
2937      nz  = adiag[i] - adiag[i+1]-1;
2938      idt = bs*i;
2939      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2940 
2941     for(k=0;k<nz;k++){
2942       idx   = bs*vi[k];
2943        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2944        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2945        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2946        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2947        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2948 
2949         v   +=  bs2;
2950     }
2951     /* x = inv_diagonal*x */
2952    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2953    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
2954    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2955    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2956 
2957   }
2958 
2959   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2960   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2961   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2962   PetscFunctionReturn(0);
2963 }
2964 
2965 #undef __FUNCT__
2966 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2967 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2968 {
2969   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2970   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2971   PetscErrorCode ierr;
2972   PetscInt       *diag = a->diag;
2973   MatScalar      *aa=a->a;
2974   PetscScalar    *x,*b;
2975 
2976   PetscFunctionBegin;
2977   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2978   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2979 
2980   {
2981     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2982     MatScalar  *v,*t=(MatScalar *)x;
2983     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2984 
2985     /* forward solve the lower triangular */
2986     idx  = 0;
2987     t[0] = (MatScalar)b[0];
2988     t[1] = (MatScalar)b[1];
2989     t[2] = (MatScalar)b[2];
2990     t[3] = (MatScalar)b[3];
2991     for (i=1; i<n; i++) {
2992       v     =  aa      + 16*ai[i];
2993       vi    =  aj      + ai[i];
2994       nz    =  diag[i] - ai[i];
2995       idx   +=  4;
2996       s1 = (MatScalar)b[idx];
2997       s2 = (MatScalar)b[1+idx];
2998       s3 = (MatScalar)b[2+idx];
2999       s4 = (MatScalar)b[3+idx];
3000       while (nz--) {
3001         jdx = 4*(*vi++);
3002         x1  = t[jdx];
3003         x2  = t[1+jdx];
3004         x3  = t[2+jdx];
3005         x4  = t[3+jdx];
3006         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3007         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3008         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3009         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3010         v    += 16;
3011       }
3012       t[idx]   = s1;
3013       t[1+idx] = s2;
3014       t[2+idx] = s3;
3015       t[3+idx] = s4;
3016     }
3017     /* backward solve the upper triangular */
3018     idt = 4*(n-1);
3019     for (i=n-1; i>=0; i--){
3020       ai16 = 16*diag[i];
3021       v    = aa + ai16 + 16;
3022       vi   = aj + diag[i] + 1;
3023       nz   = ai[i+1] - diag[i] - 1;
3024       s1   = t[idt];
3025       s2   = t[1+idt];
3026       s3   = t[2+idt];
3027       s4   = t[3+idt];
3028       while (nz--) {
3029         idx = 4*(*vi++);
3030         x1  = (MatScalar)x[idx];
3031         x2  = (MatScalar)x[1+idx];
3032         x3  = (MatScalar)x[2+idx];
3033         x4  = (MatScalar)x[3+idx];
3034         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3035         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3036         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3037         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3038         v    += 16;
3039       }
3040       v        = aa + ai16;
3041       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3042       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3043       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3044       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3045       idt -= 4;
3046     }
3047   }
3048 
3049   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3050   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3051   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3052   PetscFunctionReturn(0);
3053 }
3054 
3055 #if defined (PETSC_HAVE_SSE)
3056 
3057 #include PETSC_HAVE_SSE
3058 #undef __FUNCT__
3059 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3060 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
3061 {
3062   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3063   unsigned short *aj=(unsigned short *)a->j;
3064   PetscErrorCode ierr;
3065   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3066   MatScalar      *aa=a->a;
3067   PetscScalar    *x,*b;
3068 
3069   PetscFunctionBegin;
3070   SSE_SCOPE_BEGIN;
3071   /*
3072      Note: This code currently uses demotion of double
3073      to float when performing the mixed-mode computation.
3074      This may not be numerically reasonable for all applications.
3075   */
3076   PREFETCH_NTA(aa+16*ai[1]);
3077 
3078   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3079   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3080   {
3081     /* x will first be computed in single precision then promoted inplace to double */
3082     MatScalar      *v,*t=(MatScalar *)x;
3083     int            nz,i,idt,ai16;
3084     unsigned int   jdx,idx;
3085     unsigned short *vi;
3086     /* Forward solve the lower triangular factor. */
3087 
3088     /* First block is the identity. */
3089     idx  = 0;
3090     CONVERT_DOUBLE4_FLOAT4(t,b);
3091     v    =  aa + 16*((unsigned int)ai[1]);
3092 
3093     for (i=1; i<n;) {
3094       PREFETCH_NTA(&v[8]);
3095       vi   =  aj      + ai[i];
3096       nz   =  diag[i] - ai[i];
3097       idx +=  4;
3098 
3099       /* Demote RHS from double to float. */
3100       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3101       LOAD_PS(&t[idx],XMM7);
3102 
3103       while (nz--) {
3104         PREFETCH_NTA(&v[16]);
3105         jdx = 4*((unsigned int)(*vi++));
3106 
3107         /* 4x4 Matrix-Vector product with negative accumulation: */
3108         SSE_INLINE_BEGIN_2(&t[jdx],v)
3109           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3110 
3111           /* First Column */
3112           SSE_COPY_PS(XMM0,XMM6)
3113           SSE_SHUFFLE(XMM0,XMM0,0x00)
3114           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3115           SSE_SUB_PS(XMM7,XMM0)
3116 
3117           /* Second Column */
3118           SSE_COPY_PS(XMM1,XMM6)
3119           SSE_SHUFFLE(XMM1,XMM1,0x55)
3120           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3121           SSE_SUB_PS(XMM7,XMM1)
3122 
3123           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3124 
3125           /* Third Column */
3126           SSE_COPY_PS(XMM2,XMM6)
3127           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3128           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3129           SSE_SUB_PS(XMM7,XMM2)
3130 
3131           /* Fourth Column */
3132           SSE_COPY_PS(XMM3,XMM6)
3133           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3134           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3135           SSE_SUB_PS(XMM7,XMM3)
3136         SSE_INLINE_END_2
3137 
3138         v  += 16;
3139       }
3140       v    =  aa + 16*ai[++i];
3141       PREFETCH_NTA(v);
3142       STORE_PS(&t[idx],XMM7);
3143     }
3144 
3145     /* Backward solve the upper triangular factor.*/
3146 
3147     idt  = 4*(n-1);
3148     ai16 = 16*diag[n-1];
3149     v    = aa + ai16 + 16;
3150     for (i=n-1; i>=0;){
3151       PREFETCH_NTA(&v[8]);
3152       vi = aj + diag[i] + 1;
3153       nz = ai[i+1] - diag[i] - 1;
3154 
3155       LOAD_PS(&t[idt],XMM7);
3156 
3157       while (nz--) {
3158         PREFETCH_NTA(&v[16]);
3159         idx = 4*((unsigned int)(*vi++));
3160 
3161         /* 4x4 Matrix-Vector Product with negative accumulation: */
3162         SSE_INLINE_BEGIN_2(&t[idx],v)
3163           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3164 
3165           /* First Column */
3166           SSE_COPY_PS(XMM0,XMM6)
3167           SSE_SHUFFLE(XMM0,XMM0,0x00)
3168           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3169           SSE_SUB_PS(XMM7,XMM0)
3170 
3171           /* Second Column */
3172           SSE_COPY_PS(XMM1,XMM6)
3173           SSE_SHUFFLE(XMM1,XMM1,0x55)
3174           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3175           SSE_SUB_PS(XMM7,XMM1)
3176 
3177           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3178 
3179           /* Third Column */
3180           SSE_COPY_PS(XMM2,XMM6)
3181           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3182           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3183           SSE_SUB_PS(XMM7,XMM2)
3184 
3185           /* Fourth Column */
3186           SSE_COPY_PS(XMM3,XMM6)
3187           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3188           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3189           SSE_SUB_PS(XMM7,XMM3)
3190         SSE_INLINE_END_2
3191         v  += 16;
3192       }
3193       v    = aa + ai16;
3194       ai16 = 16*diag[--i];
3195       PREFETCH_NTA(aa+ai16+16);
3196       /*
3197          Scale the result by the diagonal 4x4 block,
3198          which was inverted as part of the factorization
3199       */
3200       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3201         /* First Column */
3202         SSE_COPY_PS(XMM0,XMM7)
3203         SSE_SHUFFLE(XMM0,XMM0,0x00)
3204         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3205 
3206         /* Second Column */
3207         SSE_COPY_PS(XMM1,XMM7)
3208         SSE_SHUFFLE(XMM1,XMM1,0x55)
3209         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3210         SSE_ADD_PS(XMM0,XMM1)
3211 
3212         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3213 
3214         /* Third Column */
3215         SSE_COPY_PS(XMM2,XMM7)
3216         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3217         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3218         SSE_ADD_PS(XMM0,XMM2)
3219 
3220         /* Fourth Column */
3221         SSE_COPY_PS(XMM3,XMM7)
3222         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3223         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3224         SSE_ADD_PS(XMM0,XMM3)
3225 
3226         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3227       SSE_INLINE_END_3
3228 
3229       v    = aa + ai16 + 16;
3230       idt -= 4;
3231     }
3232 
3233     /* Convert t from single precision back to double precision (inplace)*/
3234     idt = 4*(n-1);
3235     for (i=n-1;i>=0;i--) {
3236       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3237       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3238       PetscScalar *xtemp=&x[idt];
3239       MatScalar   *ttemp=&t[idt];
3240       xtemp[3] = (PetscScalar)ttemp[3];
3241       xtemp[2] = (PetscScalar)ttemp[2];
3242       xtemp[1] = (PetscScalar)ttemp[1];
3243       xtemp[0] = (PetscScalar)ttemp[0];
3244       idt -= 4;
3245     }
3246 
3247   } /* End of artificial scope. */
3248   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3249   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3250   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3251   SSE_SCOPE_END;
3252   PetscFunctionReturn(0);
3253 }
3254 
3255 #undef __FUNCT__
3256 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3257 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
3258 {
3259   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3260   int            *aj=a->j;
3261   PetscErrorCode ierr;
3262   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3263   MatScalar      *aa=a->a;
3264   PetscScalar    *x,*b;
3265 
3266   PetscFunctionBegin;
3267   SSE_SCOPE_BEGIN;
3268   /*
3269      Note: This code currently uses demotion of double
3270      to float when performing the mixed-mode computation.
3271      This may not be numerically reasonable for all applications.
3272   */
3273   PREFETCH_NTA(aa+16*ai[1]);
3274 
3275   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3276   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3277   {
3278     /* x will first be computed in single precision then promoted inplace to double */
3279     MatScalar *v,*t=(MatScalar *)x;
3280     int       nz,i,idt,ai16;
3281     int       jdx,idx;
3282     int       *vi;
3283     /* Forward solve the lower triangular factor. */
3284 
3285     /* First block is the identity. */
3286     idx  = 0;
3287     CONVERT_DOUBLE4_FLOAT4(t,b);
3288     v    =  aa + 16*ai[1];
3289 
3290     for (i=1; i<n;) {
3291       PREFETCH_NTA(&v[8]);
3292       vi   =  aj      + ai[i];
3293       nz   =  diag[i] - ai[i];
3294       idx +=  4;
3295 
3296       /* Demote RHS from double to float. */
3297       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3298       LOAD_PS(&t[idx],XMM7);
3299 
3300       while (nz--) {
3301         PREFETCH_NTA(&v[16]);
3302         jdx = 4*(*vi++);
3303 /*          jdx = *vi++; */
3304 
3305         /* 4x4 Matrix-Vector product with negative accumulation: */
3306         SSE_INLINE_BEGIN_2(&t[jdx],v)
3307           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3308 
3309           /* First Column */
3310           SSE_COPY_PS(XMM0,XMM6)
3311           SSE_SHUFFLE(XMM0,XMM0,0x00)
3312           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3313           SSE_SUB_PS(XMM7,XMM0)
3314 
3315           /* Second Column */
3316           SSE_COPY_PS(XMM1,XMM6)
3317           SSE_SHUFFLE(XMM1,XMM1,0x55)
3318           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3319           SSE_SUB_PS(XMM7,XMM1)
3320 
3321           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3322 
3323           /* Third Column */
3324           SSE_COPY_PS(XMM2,XMM6)
3325           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3326           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3327           SSE_SUB_PS(XMM7,XMM2)
3328 
3329           /* Fourth Column */
3330           SSE_COPY_PS(XMM3,XMM6)
3331           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3332           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3333           SSE_SUB_PS(XMM7,XMM3)
3334         SSE_INLINE_END_2
3335 
3336         v  += 16;
3337       }
3338       v    =  aa + 16*ai[++i];
3339       PREFETCH_NTA(v);
3340       STORE_PS(&t[idx],XMM7);
3341     }
3342 
3343     /* Backward solve the upper triangular factor.*/
3344 
3345     idt  = 4*(n-1);
3346     ai16 = 16*diag[n-1];
3347     v    = aa + ai16 + 16;
3348     for (i=n-1; i>=0;){
3349       PREFETCH_NTA(&v[8]);
3350       vi = aj + diag[i] + 1;
3351       nz = ai[i+1] - diag[i] - 1;
3352 
3353       LOAD_PS(&t[idt],XMM7);
3354 
3355       while (nz--) {
3356         PREFETCH_NTA(&v[16]);
3357         idx = 4*(*vi++);
3358 /*          idx = *vi++; */
3359 
3360         /* 4x4 Matrix-Vector Product with negative accumulation: */
3361         SSE_INLINE_BEGIN_2(&t[idx],v)
3362           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3363 
3364           /* First Column */
3365           SSE_COPY_PS(XMM0,XMM6)
3366           SSE_SHUFFLE(XMM0,XMM0,0x00)
3367           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3368           SSE_SUB_PS(XMM7,XMM0)
3369 
3370           /* Second Column */
3371           SSE_COPY_PS(XMM1,XMM6)
3372           SSE_SHUFFLE(XMM1,XMM1,0x55)
3373           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3374           SSE_SUB_PS(XMM7,XMM1)
3375 
3376           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3377 
3378           /* Third Column */
3379           SSE_COPY_PS(XMM2,XMM6)
3380           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3381           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3382           SSE_SUB_PS(XMM7,XMM2)
3383 
3384           /* Fourth Column */
3385           SSE_COPY_PS(XMM3,XMM6)
3386           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3387           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3388           SSE_SUB_PS(XMM7,XMM3)
3389         SSE_INLINE_END_2
3390         v  += 16;
3391       }
3392       v    = aa + ai16;
3393       ai16 = 16*diag[--i];
3394       PREFETCH_NTA(aa+ai16+16);
3395       /*
3396          Scale the result by the diagonal 4x4 block,
3397          which was inverted as part of the factorization
3398       */
3399       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3400         /* First Column */
3401         SSE_COPY_PS(XMM0,XMM7)
3402         SSE_SHUFFLE(XMM0,XMM0,0x00)
3403         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3404 
3405         /* Second Column */
3406         SSE_COPY_PS(XMM1,XMM7)
3407         SSE_SHUFFLE(XMM1,XMM1,0x55)
3408         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3409         SSE_ADD_PS(XMM0,XMM1)
3410 
3411         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3412 
3413         /* Third Column */
3414         SSE_COPY_PS(XMM2,XMM7)
3415         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3416         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3417         SSE_ADD_PS(XMM0,XMM2)
3418 
3419         /* Fourth Column */
3420         SSE_COPY_PS(XMM3,XMM7)
3421         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3422         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3423         SSE_ADD_PS(XMM0,XMM3)
3424 
3425         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3426       SSE_INLINE_END_3
3427 
3428       v    = aa + ai16 + 16;
3429       idt -= 4;
3430     }
3431 
3432     /* Convert t from single precision back to double precision (inplace)*/
3433     idt = 4*(n-1);
3434     for (i=n-1;i>=0;i--) {
3435       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3436       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3437       PetscScalar *xtemp=&x[idt];
3438       MatScalar   *ttemp=&t[idt];
3439       xtemp[3] = (PetscScalar)ttemp[3];
3440       xtemp[2] = (PetscScalar)ttemp[2];
3441       xtemp[1] = (PetscScalar)ttemp[1];
3442       xtemp[0] = (PetscScalar)ttemp[0];
3443       idt -= 4;
3444     }
3445 
3446   } /* End of artificial scope. */
3447   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3448   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3449   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3450   SSE_SCOPE_END;
3451   PetscFunctionReturn(0);
3452 }
3453 
3454 #endif
3455 
3456 #undef __FUNCT__
3457 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3458 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
3459 {
3460   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3461   IS                iscol=a->col,isrow=a->row;
3462   PetscErrorCode    ierr;
3463   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3464   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3465   const MatScalar   *aa=a->a,*v;
3466   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3467   const PetscScalar *b;
3468 
3469   PetscFunctionBegin;
3470   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3471   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3472   t  = a->solve_work;
3473 
3474   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3475   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3476 
3477   /* forward solve the lower triangular */
3478   idx    = 3*(*r++);
3479   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3480   for (i=1; i<n; i++) {
3481     v     = aa + 9*ai[i];
3482     vi    = aj + ai[i];
3483     nz    = diag[i] - ai[i];
3484     idx   = 3*(*r++);
3485     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3486     while (nz--) {
3487       idx   = 3*(*vi++);
3488       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3489       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3490       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3491       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3492       v += 9;
3493     }
3494     idx = 3*i;
3495     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3496   }
3497   /* backward solve the upper triangular */
3498   for (i=n-1; i>=0; i--){
3499     v    = aa + 9*diag[i] + 9;
3500     vi   = aj + diag[i] + 1;
3501     nz   = ai[i+1] - diag[i] - 1;
3502     idt  = 3*i;
3503     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3504     while (nz--) {
3505       idx   = 3*(*vi++);
3506       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3507       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3508       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3509       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3510       v += 9;
3511     }
3512     idc = 3*(*c--);
3513     v   = aa + 9*diag[i];
3514     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3515     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3516     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3517   }
3518   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3519   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3520   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3521   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3522   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3523   PetscFunctionReturn(0);
3524 }
3525 
3526 #undef __FUNCT__
3527 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
3528 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
3529 {
3530   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3531   IS                iscol=a->col,isrow=a->row;
3532   PetscErrorCode    ierr;
3533   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
3534   const PetscInt    *r,*c,*rout,*cout;
3535   const MatScalar   *aa=a->a,*v;
3536   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3537   const PetscScalar *b;
3538 
3539   PetscFunctionBegin;
3540   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3541   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3542   t  = a->solve_work;
3543 
3544   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3545   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3546 
3547   /* forward solve the lower triangular */
3548   idx    = 3*r[0];
3549   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3550   for (i=1; i<n; i++) {
3551     v     = aa + 9*ai[i];
3552     vi    = aj + ai[i];
3553     nz    = ai[i+1] - ai[i];
3554     idx   = 3*r[i];
3555     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3556     for(m=0;m<nz;m++){
3557       idx   = 3*vi[m];
3558       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3559       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3560       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3561       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3562       v += 9;
3563     }
3564     idx = 3*i;
3565     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3566   }
3567   /* backward solve the upper triangular */
3568   for (i=n-1; i>=0; i--){
3569     k    = 2*n-i;
3570     v    = aa + 9*ai[k];
3571     vi   = aj + ai[k];
3572     nz   = ai[k +1] - ai[k] - 1;
3573     idt  = 3*i;
3574     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3575     for(m=0;m<nz;m++){
3576       idx   = 3*vi[m];
3577       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3578       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3579       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3580       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3581       v += 9;
3582     }
3583     idc = 3*c[i];
3584     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3585     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3586     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3587   }
3588   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3589   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3590   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3591   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3592   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3593   PetscFunctionReturn(0);
3594 }
3595 
3596 /*
3597       Special case where the matrix was ILU(0) factored in the natural
3598    ordering. This eliminates the need for the column and row permutation.
3599 */
3600 #undef __FUNCT__
3601 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3602 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
3603 {
3604   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3605   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3606   PetscErrorCode    ierr;
3607   PetscInt          *diag = a->diag;
3608   const MatScalar   *aa=a->a,*v;
3609   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3610   const PetscScalar *b;
3611   PetscInt          jdx,idt,idx,nz,*vi,i;
3612 
3613   PetscFunctionBegin;
3614   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3615   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3616 
3617   /* forward solve the lower triangular */
3618   idx    = 0;
3619   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
3620   for (i=1; i<n; i++) {
3621     v     =  aa      + 9*ai[i];
3622     vi    =  aj      + ai[i];
3623     nz    =  diag[i] - ai[i];
3624     idx   +=  3;
3625     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
3626     while (nz--) {
3627       jdx   = 3*(*vi++);
3628       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3629       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3630       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3631       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3632       v    += 9;
3633     }
3634     x[idx]   = s1;
3635     x[1+idx] = s2;
3636     x[2+idx] = s3;
3637   }
3638   /* backward solve the upper triangular */
3639   for (i=n-1; i>=0; i--){
3640     v    = aa + 9*diag[i] + 9;
3641     vi   = aj + diag[i] + 1;
3642     nz   = ai[i+1] - diag[i] - 1;
3643     idt  = 3*i;
3644     s1 = x[idt];  s2 = x[1+idt];
3645     s3 = x[2+idt];
3646     while (nz--) {
3647       idx   = 3*(*vi++);
3648       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3649       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3650       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3651       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3652       v    += 9;
3653     }
3654     v        = aa +  9*diag[i];
3655     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3656     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3657     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3658   }
3659 
3660   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3661   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3662   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3663   PetscFunctionReturn(0);
3664 }
3665 
3666 #undef __FUNCT__
3667 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3668 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3669 {
3670     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3671     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3672     PetscErrorCode    ierr;
3673     PetscInt          idx,jdx,idt;
3674     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3675     const MatScalar   *aa=a->a,*v;
3676     PetscScalar       *x;
3677     const PetscScalar *b;
3678     PetscScalar        s1,s2,s3,x1,x2,x3;
3679 
3680     PetscFunctionBegin;
3681     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3682     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3683     /* forward solve the lower triangular */
3684     idx    = 0;
3685     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3686     for (i=1; i<n; i++) {
3687        v    = aa + bs2*ai[i];
3688        vi   = aj + ai[i];
3689        nz   = ai[i+1] - ai[i];
3690       idx   = bs*i;
3691        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3692       for(k=0;k<nz;k++){
3693          jdx   = bs*vi[k];
3694           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3695           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3696           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3697           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3698 
3699           v   +=  bs2;
3700         }
3701 
3702        x[idx]   = s1;
3703        x[1+idx] = s2;
3704        x[2+idx] = s3;
3705     }
3706 
3707    /* backward solve the upper triangular */
3708   for (i=n-1; i>=0; i--){
3709      v   = aa + bs2*ai[2*n-i];
3710      vi  = aj + ai[2*n-i];
3711      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3712      idt = bs*i;
3713      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3714 
3715      for(k=0;k<nz;k++){
3716        idx   = bs*vi[k];
3717        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3718        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3719        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3720        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3721 
3722         v   +=  bs2;
3723     }
3724     /* x = inv_diagonal*x */
3725    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3726    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3727    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3728 
3729   }
3730 
3731   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3732   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3733   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3734   PetscFunctionReturn(0);
3735 }
3736 
3737 #undef __FUNCT__
3738 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
3739 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3740 {
3741     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3742     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3743     PetscErrorCode    ierr;
3744     PetscInt          idx,jdx,idt;
3745     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3746     const MatScalar   *aa=a->a,*v;
3747     PetscScalar       *x;
3748     const PetscScalar *b;
3749     PetscScalar        s1,s2,s3,x1,x2,x3;
3750 
3751     PetscFunctionBegin;
3752     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3753     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3754     /* forward solve the lower triangular */
3755     idx    = 0;
3756     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3757     for (i=1; i<n; i++) {
3758        v    = aa + bs2*ai[i];
3759        vi   = aj + ai[i];
3760        nz   = ai[i+1] - ai[i];
3761       idx   = bs*i;
3762        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3763       for(k=0;k<nz;k++){
3764          jdx   = bs*vi[k];
3765           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3766           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3767           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3768           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3769 
3770           v   +=  bs2;
3771         }
3772 
3773        x[idx]   = s1;
3774        x[1+idx] = s2;
3775        x[2+idx] = s3;
3776     }
3777 
3778    /* backward solve the upper triangular */
3779   for (i=n-1; i>=0; i--){
3780     v   = aa + bs2*(adiag[i+1]+1);
3781      vi  = aj + adiag[i+1]+1;
3782      nz  = adiag[i] - adiag[i+1]-1;
3783      idt = bs*i;
3784      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3785 
3786      for(k=0;k<nz;k++){
3787        idx   = bs*vi[k];
3788        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3789        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3790        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3791        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3792 
3793         v   +=  bs2;
3794     }
3795     /* x = inv_diagonal*x */
3796    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3797    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3798    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3799 
3800   }
3801 
3802   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3803   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3804   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3805   PetscFunctionReturn(0);
3806 }
3807 
3808 #undef __FUNCT__
3809 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
3810 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
3811 {
3812   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3813   IS                iscol=a->col,isrow=a->row;
3814   PetscErrorCode    ierr;
3815   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3816   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3817   const MatScalar   *aa=a->a,*v;
3818   PetscScalar       *x,s1,s2,x1,x2,*t;
3819   const PetscScalar *b;
3820 
3821   PetscFunctionBegin;
3822   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3823   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3824   t  = a->solve_work;
3825 
3826   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3827   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3828 
3829   /* forward solve the lower triangular */
3830   idx    = 2*(*r++);
3831   t[0] = b[idx]; t[1] = b[1+idx];
3832   for (i=1; i<n; i++) {
3833     v     = aa + 4*ai[i];
3834     vi    = aj + ai[i];
3835     nz    = diag[i] - ai[i];
3836     idx   = 2*(*r++);
3837     s1  = b[idx]; s2 = b[1+idx];
3838     while (nz--) {
3839       idx   = 2*(*vi++);
3840       x1    = t[idx]; x2 = t[1+idx];
3841       s1 -= v[0]*x1 + v[2]*x2;
3842       s2 -= v[1]*x1 + v[3]*x2;
3843       v += 4;
3844     }
3845     idx = 2*i;
3846     t[idx] = s1; t[1+idx] = s2;
3847   }
3848   /* backward solve the upper triangular */
3849   for (i=n-1; i>=0; i--){
3850     v    = aa + 4*diag[i] + 4;
3851     vi   = aj + diag[i] + 1;
3852     nz   = ai[i+1] - diag[i] - 1;
3853     idt  = 2*i;
3854     s1 = t[idt]; s2 = t[1+idt];
3855     while (nz--) {
3856       idx   = 2*(*vi++);
3857       x1    = t[idx]; x2 = t[1+idx];
3858       s1 -= v[0]*x1 + v[2]*x2;
3859       s2 -= v[1]*x1 + v[3]*x2;
3860       v += 4;
3861     }
3862     idc = 2*(*c--);
3863     v   = aa + 4*diag[i];
3864     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3865     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
3866   }
3867   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3868   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3869   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3870   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3871   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3872   PetscFunctionReturn(0);
3873 }
3874 
3875 #undef __FUNCT__
3876 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
3877 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
3878 {
3879   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3880   IS                iscol=a->col,isrow=a->row;
3881   PetscErrorCode    ierr;
3882   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
3883   const PetscInt    *r,*c,*rout,*cout;
3884   const MatScalar   *aa=a->a,*v;
3885   PetscScalar       *x,s1,s2,x1,x2,*t;
3886   const PetscScalar *b;
3887 
3888   PetscFunctionBegin;
3889   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3890   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3891   t  = a->solve_work;
3892 
3893   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3894   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3895 
3896   /* forward solve the lower triangular */
3897   idx    = 2*r[0];
3898   t[0] = b[idx]; t[1] = b[1+idx];
3899   for (i=1; i<n; i++) {
3900     v     = aa + 4*ai[i];
3901     vi    = aj + ai[i];
3902     nz    = ai[i+1] - ai[i];
3903     idx   = 2*r[i];
3904     s1  = b[idx]; s2 = b[1+idx];
3905     for(m=0;m<nz;m++){
3906       jdx   = 2*vi[m];
3907       x1    = t[jdx]; x2 = t[1+jdx];
3908       s1 -= v[0]*x1 + v[2]*x2;
3909       s2 -= v[1]*x1 + v[3]*x2;
3910       v += 4;
3911     }
3912     idx = 2*i;
3913     t[idx] = s1; t[1+idx] = s2;
3914   }
3915   /* backward solve the upper triangular */
3916   for (i=n-1; i>=0; i--){
3917     k = 2*n-i;
3918     v    = aa + 4*ai[k];
3919     vi   = aj + ai[k];
3920     nz   = ai[k +1] - ai[k] - 1;
3921     idt  = 2*i;
3922     s1 = t[idt]; s2 = t[1+idt];
3923     for(m=0;m<nz;m++){
3924       idx   = 2*vi[m];
3925       x1    = t[idx]; x2 = t[1+idx];
3926       s1 -= v[0]*x1 + v[2]*x2;
3927       s2 -= v[1]*x1 + v[3]*x2;
3928       v += 4;
3929     }
3930     idc = 2*c[i];
3931     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3932     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
3933   }
3934   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3935   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3936   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3937   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3938   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3939   PetscFunctionReturn(0);
3940 }
3941 
3942 
3943 /*
3944       Special case where the matrix was ILU(0) factored in the natural
3945    ordering. This eliminates the need for the column and row permutation.
3946 */
3947 #undef __FUNCT__
3948 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3949 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
3950 {
3951   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3952   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3953   PetscErrorCode    ierr;
3954   PetscInt          *diag = a->diag;
3955   const MatScalar   *aa=a->a,*v;
3956   PetscScalar       *x,s1,s2,x1,x2;
3957   const PetscScalar *b;
3958   PetscInt          jdx,idt,idx,nz,*vi,i;
3959 
3960   PetscFunctionBegin;
3961   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3962   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3963 
3964   /* forward solve the lower triangular */
3965   idx    = 0;
3966   x[0]   = b[0]; x[1] = b[1];
3967   for (i=1; i<n; i++) {
3968     v     =  aa      + 4*ai[i];
3969     vi    =  aj      + ai[i];
3970     nz    =  diag[i] - ai[i];
3971     idx   +=  2;
3972     s1  =  b[idx];s2 = b[1+idx];
3973     while (nz--) {
3974       jdx   = 2*(*vi++);
3975       x1    = x[jdx];x2 = x[1+jdx];
3976       s1 -= v[0]*x1 + v[2]*x2;
3977       s2 -= v[1]*x1 + v[3]*x2;
3978       v    += 4;
3979     }
3980     x[idx]   = s1;
3981     x[1+idx] = s2;
3982   }
3983   /* backward solve the upper triangular */
3984   for (i=n-1; i>=0; i--){
3985     v    = aa + 4*diag[i] + 4;
3986     vi   = aj + diag[i] + 1;
3987     nz   = ai[i+1] - diag[i] - 1;
3988     idt  = 2*i;
3989     s1 = x[idt];  s2 = x[1+idt];
3990     while (nz--) {
3991       idx   = 2*(*vi++);
3992       x1    = x[idx];   x2 = x[1+idx];
3993       s1 -= v[0]*x1 + v[2]*x2;
3994       s2 -= v[1]*x1 + v[3]*x2;
3995       v    += 4;
3996     }
3997     v        = aa +  4*diag[i];
3998     x[idt]   = v[0]*s1 + v[2]*s2;
3999     x[1+idt] = v[1]*s1 + v[3]*s2;
4000   }
4001 
4002   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4003   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4004   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4005   PetscFunctionReturn(0);
4006 }
4007 
4008 #undef __FUNCT__
4009 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4010 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4011 {
4012     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4013     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4014     PetscErrorCode    ierr;
4015     PetscInt          jdx;
4016     const MatScalar   *aa=a->a,*v;
4017     PetscScalar       *x,s1,s2,x1,x2;
4018     const PetscScalar *b;
4019 
4020     PetscFunctionBegin;
4021     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4022     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4023     /* forward solve the lower triangular */
4024     idx    = 0;
4025     x[0] = b[idx]; x[1] = b[1+idx];
4026     for (i=1; i<n; i++) {
4027         v   = aa + 4*ai[i];
4028        vi   = aj + ai[i];
4029        nz   = ai[i+1] - ai[i];
4030        idx  = 2*i;
4031        s1   = b[idx];s2 = b[1+idx];
4032       for(k=0;k<nz;k++){
4033          jdx   = 2*vi[k];
4034           x1    = x[jdx];x2 = x[1+jdx];
4035           s1   -= v[0]*x1 + v[2]*x2;
4036           s2   -= v[1]*x1 + v[3]*x2;
4037            v   +=  4;
4038         }
4039        x[idx]   = s1;
4040        x[1+idx] = s2;
4041     }
4042 
4043    /* backward solve the upper triangular */
4044   for (i=n-1; i>=0; i--){
4045      v   = aa + 4*ai[2*n-i];
4046      vi  = aj + ai[2*n-i];
4047      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4048      idt = 2*i;
4049      s1 = x[idt];  s2 = x[1+idt];
4050      for(k=0;k<nz;k++){
4051       idx   = 2*vi[k];
4052        x1    = x[idx];   x2 = x[1+idx];
4053        s1 -= v[0]*x1 + v[2]*x2;
4054        s2 -= v[1]*x1 + v[3]*x2;
4055          v    += 4;
4056     }
4057     /* x = inv_diagonal*x */
4058    x[idt]   = v[0]*s1 + v[2]*s2;
4059    x[1+idt] = v[1]*s1 + v[3]*s2;
4060   }
4061 
4062   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4063   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4064   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4065   PetscFunctionReturn(0);
4066 }
4067 
4068 #undef __FUNCT__
4069 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4070 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4071 {
4072     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4073     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4074     PetscErrorCode    ierr;
4075     PetscInt          jdx;
4076     const MatScalar   *aa=a->a,*v;
4077     PetscScalar       *x,s1,s2,x1,x2;
4078     const PetscScalar *b;
4079 
4080     PetscFunctionBegin;
4081     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4082     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4083     /* forward solve the lower triangular */
4084     idx    = 0;
4085     x[0] = b[idx]; x[1] = b[1+idx];
4086     for (i=1; i<n; i++) {
4087         v   = aa + 4*ai[i];
4088        vi   = aj + ai[i];
4089        nz   = ai[i+1] - ai[i];
4090        idx  = 2*i;
4091        s1   = b[idx];s2 = b[1+idx];
4092       for(k=0;k<nz;k++){
4093          jdx   = 2*vi[k];
4094           x1    = x[jdx];x2 = x[1+jdx];
4095           s1   -= v[0]*x1 + v[2]*x2;
4096           s2   -= v[1]*x1 + v[3]*x2;
4097            v   +=  4;
4098         }
4099        x[idx]   = s1;
4100        x[1+idx] = s2;
4101     }
4102 
4103    /* backward solve the upper triangular */
4104   for (i=n-1; i>=0; i--){
4105      v   = aa + 4*(adiag[i+1]+1);
4106      vi  = aj + adiag[i+1]+1;
4107      nz  = adiag[i] - adiag[i+1]-1;
4108      idt = 2*i;
4109      s1 = x[idt];  s2 = x[1+idt];
4110      for(k=0;k<nz;k++){
4111       idx   = 2*vi[k];
4112        x1    = x[idx];   x2 = x[1+idx];
4113        s1 -= v[0]*x1 + v[2]*x2;
4114        s2 -= v[1]*x1 + v[3]*x2;
4115          v    += 4;
4116     }
4117     /* x = inv_diagonal*x */
4118    x[idt]   = v[0]*s1 + v[2]*s2;
4119    x[1+idt] = v[1]*s1 + v[3]*s2;
4120   }
4121 
4122   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4123   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4124   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4125   PetscFunctionReturn(0);
4126 }
4127 
4128 #undef __FUNCT__
4129 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4130 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4131 {
4132   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4133   IS             iscol=a->col,isrow=a->row;
4134   PetscErrorCode ierr;
4135   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4136   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4137   MatScalar      *aa=a->a,*v;
4138   PetscScalar    *x,*b,s1,*t;
4139 
4140   PetscFunctionBegin;
4141   if (!n) PetscFunctionReturn(0);
4142 
4143   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4144   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4145   t  = a->solve_work;
4146 
4147   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4148   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4149 
4150   /* forward solve the lower triangular */
4151   t[0] = b[*r++];
4152   for (i=1; i<n; i++) {
4153     v     = aa + ai[i];
4154     vi    = aj + ai[i];
4155     nz    = diag[i] - ai[i];
4156     s1  = b[*r++];
4157     while (nz--) {
4158       s1 -= (*v++)*t[*vi++];
4159     }
4160     t[i] = s1;
4161   }
4162   /* backward solve the upper triangular */
4163   for (i=n-1; i>=0; i--){
4164     v    = aa + diag[i] + 1;
4165     vi   = aj + diag[i] + 1;
4166     nz   = ai[i+1] - diag[i] - 1;
4167     s1 = t[i];
4168     while (nz--) {
4169       s1 -= (*v++)*t[*vi++];
4170     }
4171     x[*c--] = t[i] = aa[diag[i]]*s1;
4172   }
4173 
4174   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4175   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4176   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4177   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4178   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4179   PetscFunctionReturn(0);
4180 }
4181 /*
4182       Special case where the matrix was ILU(0) factored in the natural
4183    ordering. This eliminates the need for the column and row permutation.
4184 */
4185 #undef __FUNCT__
4186 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4187 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
4188 {
4189   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4190   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4191   PetscErrorCode ierr;
4192   PetscInt       *diag = a->diag;
4193   MatScalar      *aa=a->a;
4194   PetscScalar    *x,*b;
4195   PetscScalar    s1,x1;
4196   MatScalar      *v;
4197   PetscInt       jdx,idt,idx,nz,*vi,i;
4198 
4199   PetscFunctionBegin;
4200   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4201   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4202 
4203   /* forward solve the lower triangular */
4204   idx    = 0;
4205   x[0]   = b[0];
4206   for (i=1; i<n; i++) {
4207     v     =  aa      + ai[i];
4208     vi    =  aj      + ai[i];
4209     nz    =  diag[i] - ai[i];
4210     idx   +=  1;
4211     s1  =  b[idx];
4212     while (nz--) {
4213       jdx   = *vi++;
4214       x1    = x[jdx];
4215       s1 -= v[0]*x1;
4216       v    += 1;
4217     }
4218     x[idx]   = s1;
4219   }
4220   /* backward solve the upper triangular */
4221   for (i=n-1; i>=0; i--){
4222     v    = aa + diag[i] + 1;
4223     vi   = aj + diag[i] + 1;
4224     nz   = ai[i+1] - diag[i] - 1;
4225     idt  = i;
4226     s1 = x[idt];
4227     while (nz--) {
4228       idx   = *vi++;
4229       x1    = x[idx];
4230       s1 -= v[0]*x1;
4231       v    += 1;
4232     }
4233     v        = aa +  diag[i];
4234     x[idt]   = v[0]*s1;
4235   }
4236   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4238   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4239   PetscFunctionReturn(0);
4240 }
4241 
4242 /* ----------------------------------------------------------------*/
4243 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
4244 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
4245 
4246 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
4247 extern PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat,Vec,Vec);
4248 
4249 #undef __FUNCT__
4250 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
4251 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
4252 {
4253   Mat            C=B;
4254   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
4255   IS             isrow = b->row,isicol = b->icol;
4256   PetscErrorCode ierr;
4257   const PetscInt *r,*ic,*ics;
4258   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
4259   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4260   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4261   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4262   MatScalar      *v_work;
4263 
4264   PetscFunctionBegin;
4265   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4266   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4267   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4268   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
4269   ics  = ic;
4270 
4271   /* generate work space needed by dense LU factorization */
4272   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
4273   mwork    = v_work + bs;
4274   v_pivots = (PetscInt*)(mwork + bs2);
4275 
4276   for (i=0; i<n; i++){
4277     /* zero rtmp */
4278     /* L part */
4279     nz    = bi[i+1] - bi[i];
4280     bjtmp = bj + bi[i];
4281     for  (j=0; j<nz; j++){
4282       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4283     }
4284 
4285     /* U part */
4286     nz = bi[2*n-i+1] - bi[2*n-i];
4287     bjtmp = bj + bi[2*n-i];
4288     for  (j=0; j<nz; j++){
4289       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4290     }
4291 
4292     /* load in initial (unfactored row) */
4293     nz    = ai[r[i]+1] - ai[r[i]];
4294     ajtmp = aj + ai[r[i]];
4295     v     = aa + bs2*ai[r[i]];
4296     for (j=0; j<nz; j++) {
4297       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
4298     }
4299 
4300     /* elimination */
4301     bjtmp = bj + bi[i];
4302     nzL   = bi[i+1] - bi[i];
4303     for(k=0;k < nzL;k++) {
4304       row = bjtmp[k];
4305       pc = rtmp + bs2*row;
4306       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4307       if (flg) {
4308         pv         = b->a + bs2*bdiag[row];
4309         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
4310         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
4311         pv         = b->a + bs2*bi[2*n-row];
4312         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
4313         for (j=0; j<nz; j++) {
4314           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4315         }
4316         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
4317       }
4318     }
4319 
4320     /* finished row so stick it into b->a */
4321     /* L part */
4322     pv   = b->a + bs2*bi[i] ;
4323     pj   = b->j + bi[i] ;
4324     nz   = bi[i+1] - bi[i];
4325     for (j=0; j<nz; j++) {
4326       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4327     }
4328 
4329     /* Mark diagonal and invert diagonal for simplier triangular solves */
4330     pv  = b->a + bs2*bdiag[i];
4331     pj  = b->j + bdiag[i];
4332     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4333     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4334     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
4335 
4336     /* U part */
4337     pv = b->a + bs2*bi[2*n-i];
4338     pj = b->j + bi[2*n-i];
4339     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
4340     for (j=0; j<nz; j++){
4341       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4342     }
4343   }
4344 
4345   ierr = PetscFree(rtmp);CHKERRQ(ierr);
4346   ierr = PetscFree(v_work);CHKERRQ(ierr);
4347   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4348   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4349 
4350   C->assembled = PETSC_TRUE;
4351   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
4352   PetscFunctionReturn(0);
4353 }
4354 
4355 /*
4356    ilu(0) with natural ordering under new data structure.
4357    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
4358    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
4359 */
4360 #undef __FUNCT__
4361 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
4362 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4363 {
4364 
4365   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
4366   PetscErrorCode     ierr;
4367   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
4368   PetscInt           i,j,nz,*bi,*bj,*bdiag;
4369 
4370   PetscFunctionBegin;
4371   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
4372   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
4373   b    = (Mat_SeqBAIJ*)(fact)->data;
4374 
4375   /* allocate matrix arrays for new data structure */
4376   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
4377   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
4378   b->singlemalloc = PETSC_TRUE;
4379   if (!b->diag){
4380     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
4381   }
4382   bdiag = b->diag;
4383 
4384   if (n > 0) {
4385     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
4386   }
4387 
4388   /* set bi and bj with new data structure */
4389   bi = b->i;
4390   bj = b->j;
4391 
4392   /* L part */
4393   bi[0] = 0;
4394   for (i=0; i<n; i++){
4395     nz = adiag[i] - ai[i];
4396     bi[i+1] = bi[i] + nz;
4397     aj = a->j + ai[i];
4398     for (j=0; j<nz; j++){
4399       *bj = aj[j]; bj++;
4400     }
4401   }
4402 
4403   /* U part */
4404   bi[n+1] = bi[n];
4405   for (i=n-1; i>=0; i--){
4406     nz = ai[i+1] - adiag[i] - 1;
4407     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
4408     aj = a->j + adiag[i] + 1;
4409     for (j=0; j<nz; j++){
4410       *bj = aj[j]; bj++;
4411     }
4412     /* diag[i] */
4413     *bj = i; bj++;
4414     bdiag[i] = bi[2*n-i+1]-1;
4415   }
4416   PetscFunctionReturn(0);
4417 }
4418 
4419 #undef __FUNCT__
4420 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
4421 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4422 {
4423   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
4424   IS                 isicol;
4425   PetscErrorCode     ierr;
4426   const PetscInt     *r,*ic;
4427   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
4428   PetscInt           *bi,*cols,nnz,*cols_lvl;
4429   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
4430   PetscInt           i,levels,diagonal_fill;
4431   PetscTruth         col_identity,row_identity,both_identity;
4432   PetscReal          f;
4433   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
4434   PetscBT            lnkbt;
4435   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
4436   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
4437   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
4438   PetscTruth         missing;
4439   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
4440 
4441   PetscFunctionBegin;
4442   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
4443   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
4444   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
4445 
4446   f             = info->fill;
4447   levels        = (PetscInt)info->levels;
4448   diagonal_fill = (PetscInt)info->diagonal_fill;
4449   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4450 
4451   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4452   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4453   both_identity = (PetscTruth) (row_identity && col_identity);
4454 
4455   if (!levels && both_identity) {
4456     /* special case: ilu(0) with natural ordering */
4457     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4458     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
4459     /* set MatSolve routines */
4460     switch (bs){
4461     case 2:
4462       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
4463       break;
4464     case 3:
4465       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
4466       break;
4467     case 4:
4468       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
4469       break;
4470     case 5:
4471       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
4472       break;
4473     case 6:
4474       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
4475       break;
4476     case 7:
4477       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
4478       break;
4479     default:
4480       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
4481       break;
4482     }
4483 
4484     fact->factor = MAT_FACTOR_ILU;
4485     (fact)->info.factor_mallocs    = 0;
4486     (fact)->info.fill_ratio_given  = info->fill;
4487     (fact)->info.fill_ratio_needed = 1.0;
4488     b                = (Mat_SeqBAIJ*)(fact)->data;
4489     b->row           = isrow;
4490     b->col           = iscol;
4491     b->icol          = isicol;
4492     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4493     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4494     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4495     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4496     PetscFunctionReturn(0);
4497   }
4498 
4499   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4500   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4501 
4502   /* get new row pointers */
4503   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
4504   bi[0] = 0;
4505   /* bdiag is location of diagonal in factor */
4506   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
4507   bdiag[0]  = 0;
4508 
4509   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
4510   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
4511 
4512   /* create a linked list for storing column indices of the active row */
4513   nlnk = n + 1;
4514   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
4515 
4516   /* initial FreeSpace size is f*(ai[n]+1) */
4517   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
4518   current_space = free_space;
4519   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
4520   current_space_lvl = free_space_lvl;
4521 
4522   for (i=0; i<n; i++) {
4523     nzi = 0;
4524     /* copy current row into linked list */
4525     nnz  = ai[r[i]+1] - ai[r[i]];
4526     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
4527     cols = aj + ai[r[i]];
4528     lnk[i] = -1; /* marker to indicate if diagonal exists */
4529     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
4530     nzi += nlnk;
4531 
4532     /* make sure diagonal entry is included */
4533     if (diagonal_fill && lnk[i] == -1) {
4534       fm = n;
4535       while (lnk[fm] < i) fm = lnk[fm];
4536       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
4537       lnk[fm]    = i;
4538       lnk_lvl[i] = 0;
4539       nzi++; dcount++;
4540     }
4541 
4542     /* add pivot rows into the active row */
4543     nzbd = 0;
4544     prow = lnk[n];
4545     while (prow < i) {
4546       nnz      = bdiag[prow];
4547       cols     = bj_ptr[prow] + nnz + 1;
4548       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
4549       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
4550       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
4551       nzi += nlnk;
4552       prow = lnk[prow];
4553       nzbd++;
4554     }
4555     bdiag[i] = nzbd;
4556     bi[i+1]  = bi[i] + nzi;
4557 
4558     /* if free space is not available, make more free space */
4559     if (current_space->local_remaining<nzi) {
4560       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
4561       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
4562       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
4563       reallocs++;
4564     }
4565 
4566     /* copy data into free_space and free_space_lvl, then initialize lnk */
4567     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
4568     bj_ptr[i]    = current_space->array;
4569     bjlvl_ptr[i] = current_space_lvl->array;
4570 
4571     /* make sure the active row i has diagonal entry */
4572     if (*(bj_ptr[i]+bdiag[i]) != i) {
4573       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
4574     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
4575     }
4576 
4577     current_space->array           += nzi;
4578     current_space->local_used      += nzi;
4579     current_space->local_remaining -= nzi;
4580     current_space_lvl->array           += nzi;
4581     current_space_lvl->local_used      += nzi;
4582     current_space_lvl->local_remaining -= nzi;
4583   }
4584 
4585   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4586   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4587 
4588   /* destroy list of free space and other temporary arrays */
4589   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
4590 
4591   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
4592   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
4593 
4594   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
4595   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
4596   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
4597 
4598 #if defined(PETSC_USE_INFO)
4599   {
4600     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
4601     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
4602     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4603     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
4604     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4605     if (diagonal_fill) {
4606       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
4607     }
4608   }
4609 #endif
4610 
4611   /* put together the new matrix */
4612   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4613   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4614   b = (Mat_SeqBAIJ*)(fact)->data;
4615   b->free_a       = PETSC_TRUE;
4616   b->free_ij      = PETSC_TRUE;
4617   b->singlemalloc = PETSC_FALSE;
4618   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
4619   b->j          = bj;
4620   b->i          = bi;
4621   b->diag       = bdiag;
4622   b->free_diag  = PETSC_TRUE;
4623   b->ilen       = 0;
4624   b->imax       = 0;
4625   b->row        = isrow;
4626   b->col        = iscol;
4627   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4628   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4629   b->icol       = isicol;
4630   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4631   /* In b structure:  Free imax, ilen, old a, old j.
4632      Allocate bdiag, solve_work, new a, new j */
4633   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
4634   b->maxnz = b->nz = bi[2*n+1] ;
4635   (fact)->info.factor_mallocs    = reallocs;
4636   (fact)->info.fill_ratio_given  = f;
4637   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
4638   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
4639   /* set MatSolve routines */
4640   if (both_identity){
4641     switch (bs){
4642     case 2:
4643       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
4644       break;
4645     case 3:
4646       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
4647       break;
4648     case 4:
4649       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
4650       break;
4651     case 5:
4652       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
4653       break;
4654     case 6:
4655       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
4656       break;
4657     case 7:
4658       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
4659       break;
4660     default:
4661       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
4662       break;
4663     }
4664   } else {
4665     switch (bs){
4666     case 2:
4667       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
4668       break;
4669     case 3:
4670       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
4671       break;
4672     case 4:
4673       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
4674       break;
4675     case 5:
4676       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
4677       break;
4678     case 6:
4679       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
4680       break;
4681     case 7:
4682       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
4683       break;
4684     default:
4685       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
4686       break;
4687     }
4688   }
4689   PetscFunctionReturn(0);
4690 }
4691 
4692 /*
4693      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
4694    except that the data structure of Mat_SeqAIJ is slightly different.
4695    Not a good example of code reuse.
4696 */
4697 #undef __FUNCT__
4698 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
4699 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4700 {
4701   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
4702   IS             isicol;
4703   PetscErrorCode ierr;
4704   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
4705   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
4706   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
4707   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
4708   PetscTruth     col_identity,row_identity,both_identity,flg;
4709   PetscReal      f;
4710   PetscTruth     newdatastruct=PETSC_FALSE;
4711 
4712   PetscFunctionBegin;
4713   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
4714   if (newdatastruct){
4715     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4716     PetscFunctionReturn(0);
4717   }
4718 
4719   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
4720   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
4721 
4722   f             = info->fill;
4723   levels        = (PetscInt)info->levels;
4724   diagonal_fill = (PetscInt)info->diagonal_fill;
4725   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4726 
4727   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4728   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4729   both_identity = (PetscTruth) (row_identity && col_identity);
4730 
4731   if (!levels && both_identity) {  /* special case copy the nonzero structure */
4732     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
4733     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
4734 
4735     fact->factor = MAT_FACTOR_ILU;
4736     b            = (Mat_SeqBAIJ*)(fact)->data;
4737     b->row       = isrow;
4738     b->col       = iscol;
4739     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4740     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4741     b->icol      = isicol;
4742     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4743     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4744     PetscFunctionReturn(0);
4745   }
4746 
4747   /* general case perform the symbolic factorization */
4748     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4749     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4750 
4751     /* get new row pointers */
4752     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
4753     ainew[0] = 0;
4754     /* don't know how many column pointers are needed so estimate */
4755     jmax = (PetscInt)(f*ai[n] + 1);
4756     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
4757     /* ajfill is level of fill for each fill entry */
4758     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
4759     /* fill is a linked list of nonzeros in active row */
4760     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
4761     /* im is level for each filled value */
4762     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
4763     /* dloc is location of diagonal in factor */
4764     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
4765     dloc[0]  = 0;
4766     for (prow=0; prow<n; prow++) {
4767 
4768       /* copy prow into linked list */
4769       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
4770       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
4771       xi         = aj + ai[r[prow]];
4772       fill[n]    = n;
4773       fill[prow] = -1; /* marker for diagonal entry */
4774       while (nz--) {
4775 	fm  = n;
4776 	idx = ic[*xi++];
4777 	do {
4778 	  m  = fm;
4779 	  fm = fill[m];
4780 	} while (fm < idx);
4781 	fill[m]   = idx;
4782 	fill[idx] = fm;
4783 	im[idx]   = 0;
4784       }
4785 
4786       /* make sure diagonal entry is included */
4787       if (diagonal_fill && fill[prow] == -1) {
4788 	fm = n;
4789 	while (fill[fm] < prow) fm = fill[fm];
4790 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
4791 	fill[fm]   = prow;
4792 	im[prow]   = 0;
4793 	nzf++;
4794 	dcount++;
4795       }
4796 
4797       nzi = 0;
4798       row = fill[n];
4799       while (row < prow) {
4800 	incrlev = im[row] + 1;
4801 	nz      = dloc[row];
4802 	xi      = ajnew  + ainew[row] + nz + 1;
4803 	flev    = ajfill + ainew[row] + nz + 1;
4804 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
4805 	fm      = row;
4806 	while (nnz-- > 0) {
4807 	  idx = *xi++;
4808 	  if (*flev + incrlev > levels) {
4809 	    flev++;
4810 	    continue;
4811 	  }
4812 	  do {
4813 	    m  = fm;
4814 	    fm = fill[m];
4815 	  } while (fm < idx);
4816 	  if (fm != idx) {
4817 	    im[idx]   = *flev + incrlev;
4818 	    fill[m]   = idx;
4819 	    fill[idx] = fm;
4820 	    fm        = idx;
4821 	    nzf++;
4822 	  } else {
4823 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
4824 	  }
4825 	  flev++;
4826 	}
4827 	row = fill[row];
4828 	nzi++;
4829       }
4830       /* copy new filled row into permanent storage */
4831       ainew[prow+1] = ainew[prow] + nzf;
4832       if (ainew[prow+1] > jmax) {
4833 
4834 	/* estimate how much additional space we will need */
4835 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
4836 	/* just double the memory each time */
4837 	PetscInt maxadd = jmax;
4838 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
4839 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
4840 	jmax += maxadd;
4841 
4842 	/* allocate a longer ajnew and ajfill */
4843 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
4844 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4845 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
4846 	ajnew = xitmp;
4847 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
4848 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4849 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
4850 	ajfill = xitmp;
4851 	reallocate++; /* count how many reallocations are needed */
4852       }
4853       xitmp       = ajnew + ainew[prow];
4854       flev        = ajfill + ainew[prow];
4855       dloc[prow]  = nzi;
4856       fm          = fill[n];
4857       while (nzf--) {
4858 	*xitmp++ = fm;
4859 	*flev++ = im[fm];
4860 	fm      = fill[fm];
4861       }
4862       /* make sure row has diagonal entry */
4863       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
4864 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
4865     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
4866       }
4867     }
4868     ierr = PetscFree(ajfill);CHKERRQ(ierr);
4869     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4870     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4871     ierr = PetscFree(fill);CHKERRQ(ierr);
4872     ierr = PetscFree(im);CHKERRQ(ierr);
4873 
4874 #if defined(PETSC_USE_INFO)
4875     {
4876       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
4877       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
4878       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4879       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
4880       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4881       if (diagonal_fill) {
4882 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
4883       }
4884     }
4885 #endif
4886 
4887     /* put together the new matrix */
4888     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4889     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4890     b    = (Mat_SeqBAIJ*)(fact)->data;
4891     b->free_a       = PETSC_TRUE;
4892     b->free_ij      = PETSC_TRUE;
4893     b->singlemalloc = PETSC_FALSE;
4894     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
4895     b->j          = ajnew;
4896     b->i          = ainew;
4897     for (i=0; i<n; i++) dloc[i] += ainew[i];
4898     b->diag       = dloc;
4899     b->free_diag  = PETSC_TRUE;
4900     b->ilen       = 0;
4901     b->imax       = 0;
4902     b->row        = isrow;
4903     b->col        = iscol;
4904     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4905     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4906     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4907     b->icol       = isicol;
4908     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4909     /* In b structure:  Free imax, ilen, old a, old j.
4910        Allocate dloc, solve_work, new a, new j */
4911     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
4912     b->maxnz          = b->nz = ainew[n];
4913 
4914     (fact)->info.factor_mallocs    = reallocate;
4915     (fact)->info.fill_ratio_given  = f;
4916     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
4917 
4918   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
4919   PetscFunctionReturn(0);
4920 }
4921 
4922 #undef __FUNCT__
4923 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
4924 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
4925 {
4926   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
4927   /* int i,*AJ=a->j,nz=a->nz; */
4928   PetscFunctionBegin;
4929   /* Undo Column scaling */
4930 /*    while (nz--) { */
4931 /*      AJ[i] = AJ[i]/4; */
4932 /*    } */
4933   /* This should really invoke a push/pop logic, but we don't have that yet. */
4934   A->ops->setunfactored = PETSC_NULL;
4935   PetscFunctionReturn(0);
4936 }
4937 
4938 #undef __FUNCT__
4939 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
4940 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
4941 {
4942   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4943   PetscInt       *AJ=a->j,nz=a->nz;
4944   unsigned short *aj=(unsigned short *)AJ;
4945   PetscFunctionBegin;
4946   /* Is this really necessary? */
4947   while (nz--) {
4948     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
4949   }
4950   A->ops->setunfactored = PETSC_NULL;
4951   PetscFunctionReturn(0);
4952 }
4953 
4954 
4955