xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 6929473c87c3d067b830e278ca4437b6bd644cf7)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt       nz,idx,idt,j,i,oidx;
125   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
126   MatScalar      *aa=a->a,*v;
127   PetscScalar    s1,s2,x1,x2;
128   PetscScalar    *x,*b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode ierr;
182   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183   PetscInt       *diag = a->diag,oidx;
184   MatScalar      *aa=a->a,*v;
185   PetscScalar    s1,s2,s3,x1,x2,x3;
186   PetscScalar    *x,*b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode ierr;
244   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
245   PetscInt       *diag = a->diag,oidx;
246   MatScalar      *aa=a->a,*v;
247   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
248   PetscScalar    *x,*b;
249 
250   PetscFunctionBegin;
251   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
252   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
253   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
254 
255   /* forward solve the U^T */
256   idx = 0;
257   for (i=0; i<n; i++) {
258 
259     v     = aa + 16*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
262     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
263     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
264     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
265     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
266     v += 16;
267 
268     vi    = aj + diag[i] + 1;
269     nz    = ai[i+1] - diag[i] - 1;
270     while (nz--) {
271       oidx = 4*(*vi++);
272       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
273       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
274       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
275       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
276       v  += 16;
277     }
278     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
279     idx += 4;
280   }
281   /* backward solve the L^T */
282   for (i=n-1; i>=0; i--){
283     v    = aa + 16*diag[i] - 16;
284     vi   = aj + diag[i] - 1;
285     nz   = diag[i] - ai[i];
286     idt  = 4*i;
287     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
288     while (nz--) {
289       idx   = 4*(*vi--);
290       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
291       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
292       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
293       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
294       v -= 16;
295     }
296   }
297   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
298   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
299   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
300   PetscFunctionReturn(0);
301 }
302 
303 #undef __FUNCT__
304 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
305 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
306 {
307   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
308   PetscErrorCode ierr;
309   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
310   PetscInt       *diag = a->diag,oidx;
311   MatScalar      *aa=a->a,*v;
312   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
313   PetscScalar    *x,*b;
314 
315   PetscFunctionBegin;
316   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
317   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
318   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
319 
320   /* forward solve the U^T */
321   idx = 0;
322   for (i=0; i<n; i++) {
323 
324     v     = aa + 25*diag[i];
325     /* multiply by the inverse of the block diagonal */
326     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
327     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
328     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
329     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
330     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
331     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
332     v += 25;
333 
334     vi    = aj + diag[i] + 1;
335     nz    = ai[i+1] - diag[i] - 1;
336     while (nz--) {
337       oidx = 5*(*vi++);
338       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
339       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
340       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
341       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
342       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
343       v  += 25;
344     }
345     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
346     idx += 5;
347   }
348   /* backward solve the L^T */
349   for (i=n-1; i>=0; i--){
350     v    = aa + 25*diag[i] - 25;
351     vi   = aj + diag[i] - 1;
352     nz   = diag[i] - ai[i];
353     idt  = 5*i;
354     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
355     while (nz--) {
356       idx   = 5*(*vi--);
357       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
358       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
359       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
360       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
361       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
362       v -= 25;
363     }
364   }
365   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
366   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
367   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
368   PetscFunctionReturn(0);
369 }
370 
371 #undef __FUNCT__
372 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
373 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
374 {
375   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
376   PetscErrorCode ierr;
377   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
378   PetscInt       *diag = a->diag,oidx;
379   MatScalar      *aa=a->a,*v;
380   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
381   PetscScalar    *x,*b;
382 
383   PetscFunctionBegin;
384   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
385   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
386   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
387 
388   /* forward solve the U^T */
389   idx = 0;
390   for (i=0; i<n; i++) {
391 
392     v     = aa + 36*diag[i];
393     /* multiply by the inverse of the block diagonal */
394     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
395     x6    = x[5+idx];
396     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
397     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
398     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
399     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
400     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
401     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
402     v += 36;
403 
404     vi    = aj + diag[i] + 1;
405     nz    = ai[i+1] - diag[i] - 1;
406     while (nz--) {
407       oidx = 6*(*vi++);
408       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
409       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
410       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
411       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
412       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
413       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
414       v  += 36;
415     }
416     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
417     x[5+idx] = s6;
418     idx += 6;
419   }
420   /* backward solve the L^T */
421   for (i=n-1; i>=0; i--){
422     v    = aa + 36*diag[i] - 36;
423     vi   = aj + diag[i] - 1;
424     nz   = diag[i] - ai[i];
425     idt  = 6*i;
426     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
427     s6 = x[5+idt];
428     while (nz--) {
429       idx   = 6*(*vi--);
430       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
431       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
432       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
433       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
434       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
435       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
436       v -= 36;
437     }
438   }
439   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
440   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
441   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
442   PetscFunctionReturn(0);
443 }
444 
445 #undef __FUNCT__
446 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
447 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
448 {
449   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
450   PetscErrorCode ierr;
451   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
452   PetscInt       *diag = a->diag,oidx;
453   MatScalar      *aa=a->a,*v;
454   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
455   PetscScalar    *x,*b;
456 
457   PetscFunctionBegin;
458   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
459   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
460   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
461 
462   /* forward solve the U^T */
463   idx = 0;
464   for (i=0; i<n; i++) {
465 
466     v     = aa + 49*diag[i];
467     /* multiply by the inverse of the block diagonal */
468     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
469     x6    = x[5+idx]; x7 = x[6+idx];
470     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
471     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
472     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
473     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
474     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
475     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
476     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
477     v += 49;
478 
479     vi    = aj + diag[i] + 1;
480     nz    = ai[i+1] - diag[i] - 1;
481     while (nz--) {
482       oidx = 7*(*vi++);
483       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
484       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
485       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
486       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
487       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
488       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
489       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
490       v  += 49;
491     }
492     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
493     x[5+idx] = s6;x[6+idx] = s7;
494     idx += 7;
495   }
496   /* backward solve the L^T */
497   for (i=n-1; i>=0; i--){
498     v    = aa + 49*diag[i] - 49;
499     vi   = aj + diag[i] - 1;
500     nz   = diag[i] - ai[i];
501     idt  = 7*i;
502     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
503     s6 = x[5+idt];s7 = x[6+idt];
504     while (nz--) {
505       idx   = 7*(*vi--);
506       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
507       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
508       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
509       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
510       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
511       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
512       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
513       v -= 49;
514     }
515   }
516   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
517   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
518   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
519   PetscFunctionReturn(0);
520 }
521 
522 /*---------------------------------------------------------------------------------------------*/
523 #undef __FUNCT__
524 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
525 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
526 {
527   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
528   IS             iscol=a->col,isrow=a->row;
529   PetscErrorCode ierr;
530   const PetscInt *r,*c,*rout,*cout;
531   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
532   PetscInt       *diag = a->diag;
533   MatScalar      *aa=a->a,*v;
534   PetscScalar    s1,*x,*b,*t;
535 
536   PetscFunctionBegin;
537   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
538   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
539   t  = a->solve_work;
540 
541   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
542   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
543 
544   /* copy the b into temp work space according to permutation */
545   for (i=0; i<n; i++) {
546     t[i] = b[c[i]];
547   }
548 
549   /* forward solve the U^T */
550   for (i=0; i<n; i++) {
551 
552     v     = aa + diag[i];
553     /* multiply by the inverse of the block diagonal */
554     s1    = (*v++)*t[i];
555     vi    = aj + diag[i] + 1;
556     nz    = ai[i+1] - diag[i] - 1;
557     while (nz--) {
558       t[*vi++]  -= (*v++)*s1;
559     }
560     t[i]   = s1;
561   }
562   /* backward solve the L^T */
563   for (i=n-1; i>=0; i--){
564     v    = aa + diag[i] - 1;
565     vi   = aj + diag[i] - 1;
566     nz   = diag[i] - ai[i];
567     s1   = t[i];
568     while (nz--) {
569       t[*vi--]   -=  (*v--)*s1;
570     }
571   }
572 
573   /* copy t into x according to permutation */
574   for (i=0; i<n; i++) {
575     x[r[i]]   = t[i];
576   }
577 
578   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
579   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
580   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
581   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
582   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
583   PetscFunctionReturn(0);
584 }
585 
586 #undef __FUNCT__
587 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
588 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
589 {
590   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
591   IS             iscol=a->col,isrow=a->row;
592   PetscErrorCode ierr;
593   const PetscInt *r,*c,*rout,*cout;
594   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
595   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
596   MatScalar      *aa=a->a,*v;
597   PetscScalar    s1,s2,x1,x2;
598   PetscScalar    *x,*b,*t;
599 
600   PetscFunctionBegin;
601   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
602   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
603   t  = a->solve_work;
604 
605   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
606   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
607 
608   /* copy the b into temp work space according to permutation */
609   ii = 0;
610   for (i=0; i<n; i++) {
611     ic      = 2*c[i];
612     t[ii]   = b[ic];
613     t[ii+1] = b[ic+1];
614     ii += 2;
615   }
616 
617   /* forward solve the U^T */
618   idx = 0;
619   for (i=0; i<n; i++) {
620 
621     v     = aa + 4*diag[i];
622     /* multiply by the inverse of the block diagonal */
623     x1    = t[idx];   x2 = t[1+idx];
624     s1 = v[0]*x1  +  v[1]*x2;
625     s2 = v[2]*x1  +  v[3]*x2;
626     v += 4;
627 
628     vi    = aj + diag[i] + 1;
629     nz    = ai[i+1] - diag[i] - 1;
630     while (nz--) {
631       oidx = 2*(*vi++);
632       t[oidx]   -= v[0]*s1  +  v[1]*s2;
633       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
634       v  += 4;
635     }
636     t[idx]   = s1;t[1+idx] = s2;
637     idx += 2;
638   }
639   /* backward solve the L^T */
640   for (i=n-1; i>=0; i--){
641     v    = aa + 4*diag[i] - 4;
642     vi   = aj + diag[i] - 1;
643     nz   = diag[i] - ai[i];
644     idt  = 2*i;
645     s1 = t[idt];  s2 = t[1+idt];
646     while (nz--) {
647       idx   = 2*(*vi--);
648       t[idx]   -=  v[0]*s1 +  v[1]*s2;
649       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
650       v -= 4;
651     }
652   }
653 
654   /* copy t into x according to permutation */
655   ii = 0;
656   for (i=0; i<n; i++) {
657     ir      = 2*r[i];
658     x[ir]   = t[ii];
659     x[ir+1] = t[ii+1];
660     ii += 2;
661   }
662 
663   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
664   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
665   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
666   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
667   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
668   PetscFunctionReturn(0);
669 }
670 
671 #undef __FUNCT__
672 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
673 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
674 {
675   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
676   IS             iscol=a->col,isrow=a->row;
677   PetscErrorCode ierr;
678   const PetscInt *r,*c,*rout,*cout;
679   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
680   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
681   MatScalar      *aa=a->a,*v;
682   PetscScalar    s1,s2,s3,x1,x2,x3;
683   PetscScalar    *x,*b,*t;
684 
685   PetscFunctionBegin;
686   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
687   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
688   t  = a->solve_work;
689 
690   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
691   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
692 
693   /* copy the b into temp work space according to permutation */
694   ii = 0;
695   for (i=0; i<n; i++) {
696     ic      = 3*c[i];
697     t[ii]   = b[ic];
698     t[ii+1] = b[ic+1];
699     t[ii+2] = b[ic+2];
700     ii += 3;
701   }
702 
703   /* forward solve the U^T */
704   idx = 0;
705   for (i=0; i<n; i++) {
706 
707     v     = aa + 9*diag[i];
708     /* multiply by the inverse of the block diagonal */
709     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
710     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
711     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
712     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
713     v += 9;
714 
715     vi    = aj + diag[i] + 1;
716     nz    = ai[i+1] - diag[i] - 1;
717     while (nz--) {
718       oidx = 3*(*vi++);
719       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
720       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
721       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
722       v  += 9;
723     }
724     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
725     idx += 3;
726   }
727   /* backward solve the L^T */
728   for (i=n-1; i>=0; i--){
729     v    = aa + 9*diag[i] - 9;
730     vi   = aj + diag[i] - 1;
731     nz   = diag[i] - ai[i];
732     idt  = 3*i;
733     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
734     while (nz--) {
735       idx   = 3*(*vi--);
736       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
737       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
738       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
739       v -= 9;
740     }
741   }
742 
743   /* copy t into x according to permutation */
744   ii = 0;
745   for (i=0; i<n; i++) {
746     ir      = 3*r[i];
747     x[ir]   = t[ii];
748     x[ir+1] = t[ii+1];
749     x[ir+2] = t[ii+2];
750     ii += 3;
751   }
752 
753   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
754   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
755   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
756   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
757   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
758   PetscFunctionReturn(0);
759 }
760 
761 #undef __FUNCT__
762 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
763 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
764 {
765   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
766   IS             iscol=a->col,isrow=a->row;
767   PetscErrorCode ierr;
768   const PetscInt *r,*c,*rout,*cout;
769   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
770   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
771   MatScalar      *aa=a->a,*v;
772   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
773   PetscScalar    *x,*b,*t;
774 
775   PetscFunctionBegin;
776   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
777   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
778   t  = a->solve_work;
779 
780   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
781   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
782 
783   /* copy the b into temp work space according to permutation */
784   ii = 0;
785   for (i=0; i<n; i++) {
786     ic      = 4*c[i];
787     t[ii]   = b[ic];
788     t[ii+1] = b[ic+1];
789     t[ii+2] = b[ic+2];
790     t[ii+3] = b[ic+3];
791     ii += 4;
792   }
793 
794   /* forward solve the U^T */
795   idx = 0;
796   for (i=0; i<n; i++) {
797 
798     v     = aa + 16*diag[i];
799     /* multiply by the inverse of the block diagonal */
800     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
801     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
802     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
803     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
804     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
805     v += 16;
806 
807     vi    = aj + diag[i] + 1;
808     nz    = ai[i+1] - diag[i] - 1;
809     while (nz--) {
810       oidx = 4*(*vi++);
811       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
812       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
813       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
814       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
815       v  += 16;
816     }
817     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
818     idx += 4;
819   }
820   /* backward solve the L^T */
821   for (i=n-1; i>=0; i--){
822     v    = aa + 16*diag[i] - 16;
823     vi   = aj + diag[i] - 1;
824     nz   = diag[i] - ai[i];
825     idt  = 4*i;
826     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
827     while (nz--) {
828       idx   = 4*(*vi--);
829       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
830       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
831       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
832       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
833       v -= 16;
834     }
835   }
836 
837   /* copy t into x according to permutation */
838   ii = 0;
839   for (i=0; i<n; i++) {
840     ir      = 4*r[i];
841     x[ir]   = t[ii];
842     x[ir+1] = t[ii+1];
843     x[ir+2] = t[ii+2];
844     x[ir+3] = t[ii+3];
845     ii += 4;
846   }
847 
848   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
849   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
850   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
851   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
852   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
853   PetscFunctionReturn(0);
854 }
855 
/*
   MatSolveTranspose_SeqBAIJ_5 - transpose solve with a factored SeqBAIJ matrix
   whose blocks are 5x5 (block indices are scaled by 5, block offsets by 25).

   Input:
     A  - matrix; A->data is read as Mat_SeqBAIJ (a, i, j, diag, row, col, solve_work)
     bb - right-hand side vector
   Output:
     xx - solution vector

   All arithmetic happens in the work array a->solve_work (t):
     1. gather:  block i of t = block c[i] of b   (c = indices of a->col)
     2. sweep i = 0..n-1 using the block at diag[i] and the entries stored after it
        in row i (the "U^T" phase)
     3. sweep i = n-1..0 using the entries stored before diag[i] in row i
        (the "L^T" phase)
     4. scatter: block r[i] of x = block i of t   (r = indices of a->row)

   Returns 0; the result of every PETSc call is passed through CHKERRQ.
*/
856 #undef __FUNCT__
857 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
858 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
859 {
860   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
861   IS             iscol=a->col,isrow=a->row;
862   PetscErrorCode ierr;
863   const PetscInt *r,*c,*rout,*cout;
864   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
865   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
866   MatScalar      *aa=a->a,*v;
867   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
868   PetscScalar    *x,*b,*t;
869 
870   PetscFunctionBegin;
871   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
872   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
873   t  = a->solve_work;
874 
875   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
876   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
877 
878   /* copy the b into temp work space according to permutation */
879   ii = 0;
880   for (i=0; i<n; i++) {
881     ic      = 5*c[i];
882     t[ii]   = b[ic];
883     t[ii+1] = b[ic+1];
884     t[ii+2] = b[ic+2];
885     t[ii+3] = b[ic+3];
886     t[ii+4] = b[ic+4];
887     ii += 5;
888   }
889 
890   /* forward solve the U^T */
891   idx = 0;
892   for (i=0; i<n; i++) {
893 
894     v     = aa + 25*diag[i];
895     /* multiply by the inverse of the block diagonal */
      /* s_k = sum_j v[5*k+j]*x_j: the block at diag[i] is read with unit stride per output.
         NOTE(review): presumably the blocks are stored column-major, which would make this
         the transposed product - confirm against the non-transpose 5x5 solve. */
896     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
897     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
898     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
899     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
900     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
901     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
902     v += 25;
903 
      /* v now sits on the block stored right after the diagonal one; nz counts the
         entries of row i stored after position diag[i] */
904     vi    = aj + diag[i] + 1;
905     nz    = ai[i+1] - diag[i] - 1;
906     while (nz--) {
        /* push the contribution of block i into block *vi of t */
907       oidx = 5*(*vi++);
908       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
909       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
910       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
911       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
912       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
913       v  += 25;
914     }
      /* block i of t is overwritten with s only after the updates above, which used
         the local copies s1..s5 */
915     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
916     idx += 5;
917   }
918   /* backward solve the L^T */
919   for (i=n-1; i>=0; i--){
      /* start on the entry stored immediately before diag[i] and walk backwards;
         nz counts the entries of row i stored before position diag[i] */
920     v    = aa + 25*diag[i] - 25;
921     vi   = aj + diag[i] - 1;
922     nz   = diag[i] - ai[i];
923     idt  = 5*i;
924     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
925     while (nz--) {
926       idx   = 5*(*vi--);
927       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
928       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
929       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
930       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
931       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
932       v -= 25;
933     }
934   }
935 
936   /* copy t into x according to permutation */
937   ii = 0;
938   for (i=0; i<n; i++) {
939     ir      = 5*r[i];
940     x[ir]   = t[ii];
941     x[ir+1] = t[ii+1];
942     x[ir+2] = t[ii+2];
943     x[ir+3] = t[ii+3];
944     x[ir+4] = t[ii+4];
945     ii += 5;
946   }
947 
948   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
949   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
950   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
951   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
952   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
953   PetscFunctionReturn(0);
954 }
955 
/*
   MatSolveTranspose_SeqBAIJ_6 - transpose solve with a factored SeqBAIJ matrix
   whose blocks are 6x6 (block indices are scaled by 6, block offsets by 36).

   Input:
     A  - matrix; A->data is read as Mat_SeqBAIJ (a, i, j, diag, row, col, solve_work)
     bb - right-hand side vector
   Output:
     xx - solution vector

   All arithmetic happens in the work array a->solve_work (t):
     1. gather:  block i of t = block c[i] of b   (c = indices of a->col)
     2. sweep i = 0..n-1 using the block at diag[i] and the entries stored after it
        in row i (the "U^T" phase)
     3. sweep i = n-1..0 using the entries stored before diag[i] in row i
        (the "L^T" phase)
     4. scatter: block r[i] of x = block i of t   (r = indices of a->row)

   Returns 0; the result of every PETSc call is passed through CHKERRQ.
*/
956 #undef __FUNCT__
957 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
958 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
959 {
960   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
961   IS             iscol=a->col,isrow=a->row;
962   PetscErrorCode ierr;
963   const PetscInt *r,*c,*rout,*cout;
964   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
965   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
966   MatScalar      *aa=a->a,*v;
967   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
968   PetscScalar    *x,*b,*t;
969 
970   PetscFunctionBegin;
971   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
972   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
973   t  = a->solve_work;
974 
975   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
976   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
977 
978   /* copy the b into temp work space according to permutation */
979   ii = 0;
980   for (i=0; i<n; i++) {
981     ic      = 6*c[i];
982     t[ii]   = b[ic];
983     t[ii+1] = b[ic+1];
984     t[ii+2] = b[ic+2];
985     t[ii+3] = b[ic+3];
986     t[ii+4] = b[ic+4];
987     t[ii+5] = b[ic+5];
988     ii += 6;
989   }
990 
991   /* forward solve the U^T */
992   idx = 0;
993   for (i=0; i<n; i++) {
994 
995     v     = aa + 36*diag[i];
996     /* multiply by the inverse of the block diagonal */
      /* s_k = sum_j v[6*k+j]*x_j: the block at diag[i] is read with unit stride per output */
997     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
998     x6    = t[5+idx];
999     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1000     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1001     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1002     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1003     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1004     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1005     v += 36;
1006 
      /* v now sits on the block stored right after the diagonal one; nz counts the
         entries of row i stored after position diag[i] */
1007     vi    = aj + diag[i] + 1;
1008     nz    = ai[i+1] - diag[i] - 1;
1009     while (nz--) {
        /* push the contribution of block i into block *vi of t */
1010       oidx = 6*(*vi++);
1011       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1012       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1013       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1014       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1015       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1016       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1017       v  += 36;
1018     }
      /* block i of t is overwritten with s only after the updates above */
1019     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1020     t[5+idx] = s6;
1021     idx += 6;
1022   }
1023   /* backward solve the L^T */
1024   for (i=n-1; i>=0; i--){
      /* start on the entry stored immediately before diag[i] and walk backwards;
         nz counts the entries of row i stored before position diag[i] */
1025     v    = aa + 36*diag[i] - 36;
1026     vi   = aj + diag[i] - 1;
1027     nz   = diag[i] - ai[i];
1028     idt  = 6*i;
1029     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1030     s6 = t[5+idt];
1031     while (nz--) {
1032       idx   = 6*(*vi--);
1033       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1034       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1035       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1036       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1037       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1038       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1039       v -= 36;
1040     }
1041   }
1042 
1043   /* copy t into x according to permutation */
1044   ii = 0;
1045   for (i=0; i<n; i++) {
1046     ir      = 6*r[i];
1047     x[ir]   = t[ii];
1048     x[ir+1] = t[ii+1];
1049     x[ir+2] = t[ii+2];
1050     x[ir+3] = t[ii+3];
1051     x[ir+4] = t[ii+4];
1052     x[ir+5] = t[ii+5];
1053     ii += 6;
1054   }
1055 
1056   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1057   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1058   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1059   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1060   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1061   PetscFunctionReturn(0);
1062 }
1063 
/*
   MatSolveTranspose_SeqBAIJ_7 - transpose solve with a factored SeqBAIJ matrix
   whose blocks are 7x7 (block indices are scaled by 7, block offsets by 49).

   Input:
     A  - matrix; A->data is read as Mat_SeqBAIJ (a, i, j, diag, row, col, solve_work)
     bb - right-hand side vector
   Output:
     xx - solution vector

   All arithmetic happens in the work array a->solve_work (t):
     1. gather:  block i of t = block c[i] of b   (c = indices of a->col)
     2. sweep i = 0..n-1 using the block at diag[i] and the entries stored after it
        in row i (the "U^T" phase)
     3. sweep i = n-1..0 using the entries stored before diag[i] in row i
        (the "L^T" phase)
     4. scatter: block r[i] of x = block i of t   (r = indices of a->row)

   Returns 0; the result of every PETSc call is passed through CHKERRQ.
*/
1064 #undef __FUNCT__
1065 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1066 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1067 {
1068   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1069   IS             iscol=a->col,isrow=a->row;
1070   PetscErrorCode ierr;
1071   const PetscInt *r,*c,*rout,*cout;
1072   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1073   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1074   MatScalar      *aa=a->a,*v;
1075   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1076   PetscScalar    *x,*b,*t;
1077 
1078   PetscFunctionBegin;
1079   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1080   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1081   t  = a->solve_work;
1082 
1083   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1084   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1085 
1086   /* copy the b into temp work space according to permutation */
1087   ii = 0;
1088   for (i=0; i<n; i++) {
1089     ic      = 7*c[i];
1090     t[ii]   = b[ic];
1091     t[ii+1] = b[ic+1];
1092     t[ii+2] = b[ic+2];
1093     t[ii+3] = b[ic+3];
1094     t[ii+4] = b[ic+4];
1095     t[ii+5] = b[ic+5];
1096     t[ii+6] = b[ic+6];
1097     ii += 7;
1098   }
1099 
1100   /* forward solve the U^T */
1101   idx = 0;
1102   for (i=0; i<n; i++) {
1103 
1104     v     = aa + 49*diag[i];
1105     /* multiply by the inverse of the block diagonal */
      /* s_k = sum_j v[7*k+j]*x_j; MatSolve_SeqBAIJ_7 in this file reads its blocks the
         other way round (v[k+7*j]), so this is the transposed access pattern */
1106     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1107     x6    = t[5+idx]; x7 = t[6+idx];
1108     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1109     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1110     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1111     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1112     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1113     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1114     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1115     v += 49;
1116 
      /* v now sits on the block stored right after the diagonal one; nz counts the
         entries of row i stored after position diag[i] */
1117     vi    = aj + diag[i] + 1;
1118     nz    = ai[i+1] - diag[i] - 1;
1119     while (nz--) {
        /* push the contribution of block i into block *vi of t */
1120       oidx = 7*(*vi++);
1121       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1122       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1123       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1124       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1125       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1126       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1127       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1128       v  += 49;
1129     }
      /* block i of t is overwritten with s only after the updates above */
1130     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1131     t[5+idx] = s6;t[6+idx] = s7;
1132     idx += 7;
1133   }
1134   /* backward solve the L^T */
1135   for (i=n-1; i>=0; i--){
      /* start on the entry stored immediately before diag[i] and walk backwards;
         nz counts the entries of row i stored before position diag[i] */
1136     v    = aa + 49*diag[i] - 49;
1137     vi   = aj + diag[i] - 1;
1138     nz   = diag[i] - ai[i];
1139     idt  = 7*i;
1140     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1141     s6 = t[5+idt];s7 = t[6+idt];
1142     while (nz--) {
1143       idx   = 7*(*vi--);
1144       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1145       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1146       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1147       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1148       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1149       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1150       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1151       v -= 49;
1152     }
1153   }
1154 
1155   /* copy t into x according to permutation */
1156   ii = 0;
1157   for (i=0; i<n; i++) {
1158     ir      = 7*r[i];
1159     x[ir]   = t[ii];
1160     x[ir+1] = t[ii+1];
1161     x[ir+2] = t[ii+2];
1162     x[ir+3] = t[ii+3];
1163     x[ir+4] = t[ii+4];
1164     x[ir+5] = t[ii+5];
1165     x[ir+6] = t[ii+6];
1166     ii += 7;
1167   }
1168 
1169   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1170   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1172   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1173   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1174   PetscFunctionReturn(0);
1175 }
1176 
1177 /* ----------------------------------------------------------- */
1178 #undef __FUNCT__
1179 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1180 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1181 {
1182   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1183   IS             iscol=a->col,isrow=a->row;
1184   PetscErrorCode ierr;
1185   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1186   PetscInt       i,n=a->mbs;
1187   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1188   MatScalar      *aa=a->a,*v;
1189   PetscScalar    *x,*b,*s,*t,*ls;
1190 
1191   PetscFunctionBegin;
1192   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1193   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1194   t  = a->solve_work;
1195 
1196   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1197   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1198 
1199   /* forward solve the lower triangular */
1200   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1201   for (i=1; i<n; i++) {
1202     v   = aa + bs2*ai[i];
1203     vi  = aj + ai[i];
1204     nz  = a->diag[i] - ai[i];
1205     s = t + bs*i;
1206     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1207     while (nz--) {
1208       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1209       v += bs2;
1210     }
1211   }
1212   /* backward solve the upper triangular */
1213   ls = a->solve_work + A->cmap->n;
1214   for (i=n-1; i>=0; i--){
1215     v   = aa + bs2*(a->diag[i] + 1);
1216     vi  = aj + a->diag[i] + 1;
1217     nz  = ai[i+1] - a->diag[i] - 1;
1218     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1219     while (nz--) {
1220       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1221       v += bs2;
1222     }
1223     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1224     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1225   }
1226 
1227   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1228   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1229   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1230   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1231   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1232   PetscFunctionReturn(0);
1233 }
1234 
1235 /* ----------------------------------------------------------- */
1236 #undef __FUNCT__
1237 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
1238 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1239 {
1240   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1241   IS                iscol=a->col,isrow=a->row;
1242   PetscErrorCode    ierr;
1243   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1244   PetscInt          i,n=a->mbs,j;
1245   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
1246   const MatScalar   *aa=a->a,*v;
1247   PetscScalar       *x,*t,*ls;
1248   const PetscScalar *b;
1249   PetscFunctionBegin;
1250   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1251   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1252   t    = a->solve_work;
1253 
1254   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1255   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1256 
1257   /* copy the b into temp work space according to permutation */
1258   for (i=0; i<n; i++) {
1259     for (j=0; j<bs; j++) {
1260       t[i*bs+j] = b[c[i]*bs+j];
1261     }
1262   }
1263 
1264 
1265   /* forward solve the upper triangular transpose */
1266   ls = a->solve_work + A->cmap->n;
1267   for (i=0; i<n; i++){
1268     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1269     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1270     v   = aa + bs2*(a->diag[i] + 1);
1271     vi  = aj + a->diag[i] + 1;
1272     nz  = ai[i+1] - a->diag[i] - 1;
1273     while (nz--) {
1274       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1275       v += bs2;
1276     }
1277   }
1278 
1279   /* backward solve the lower triangular transpose */
1280   for (i=n-1; i>=0; i--) {
1281     v   = aa + bs2*ai[i];
1282     vi  = aj + ai[i];
1283     nz  = a->diag[i] - ai[i];
1284     while (nz--) {
1285       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1286       v += bs2;
1287     }
1288   }
1289 
1290   /* copy t into x according to permutation */
1291   for (i=0; i<n; i++) {
1292     for (j=0; j<bs; j++) {
1293       x[bs*r[i]+j]   = t[bs*i+j];
1294     }
1295   }
1296 
1297   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1298   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1299   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1300   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1301   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1302   PetscFunctionReturn(0);
1303 }
1304 
/*
   MatSolve_SeqBAIJ_7 - forward and backward triangular sweeps over a factored
   SeqBAIJ matrix whose blocks are 7x7 (block indices scaled by 7, offsets by 49),
   with row and column permutations taken from a->row and a->col.

   Input:
     A  - matrix; A->data is read as Mat_SeqBAIJ (a, i, j, diag, row, col, solve_work)
     bb - right-hand side vector
   Output:
     xx - solution vector

   The row indices are consumed front to back (*r++) while the forward sweep fills
   the work array t; the column indices are consumed back to front (c starts at
   cout + (n-1) and is read with *c--) while the backward sweep runs i = n-1..0,
   so block i of the result is written to block cout[i] of x.

   Block products here use s_k -= sum_j v[k+7*j]*x_j, i.e. stride-7 access.

   Returns 0; the result of every PETSc call is passed through CHKERRQ.
*/
1305 #undef __FUNCT__
1306 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1307 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1308 {
1309   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1310   IS             iscol=a->col,isrow=a->row;
1311   PetscErrorCode ierr;
1312   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1313   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1314   MatScalar      *aa=a->a,*v;
1315   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1316   PetscScalar    *x,*b,*t;
1317 
1318   PetscFunctionBegin;
1319   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1320   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1321   t  = a->solve_work;
1322 
1323   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1324   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1325 
1326   /* forward solve the lower triangular */
    /* block row 0 is copied straight from b; the loop below starts at i=1.
       NOTE(review): r[0] and b are read here even when n == 0 - confirm callers
       never reach this routine with an empty matrix. */
1327   idx    = 7*(*r++);
1328   t[0] = b[idx];   t[1] = b[1+idx];
1329   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1330   t[5] = b[5+idx]; t[6] = b[6+idx];
1331 
1332   for (i=1; i<n; i++) {
      /* entries of row i stored before position diag[i] */
1333     v     = aa + 49*ai[i];
1334     vi    = aj + ai[i];
1335     nz    = diag[i] - ai[i];
1336     idx   = 7*(*r++);
1337     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1338     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1339     while (nz--) {
1340       idx   = 7*(*vi++);
1341       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1342       x4    = t[3+idx];x5 = t[4+idx];
1343       x6    = t[5+idx];x7 = t[6+idx];
1344       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1345       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1346       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1347       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1348       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1349       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1350       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1351       v += 49;
1352     }
1353     idx = 7*i;
1354     t[idx]   = s1;t[1+idx] = s2;
1355     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1356     t[5+idx] = s6;t[6+idx] = s7;
1357   }
1358   /* backward solve the upper triangular */
1359   for (i=n-1; i>=0; i--){
      /* entries of row i stored after position diag[i] */
1360     v    = aa + 49*diag[i] + 49;
1361     vi   = aj + diag[i] + 1;
1362     nz   = ai[i+1] - diag[i] - 1;
1363     idt  = 7*i;
1364     s1 = t[idt];  s2 = t[1+idt];
1365     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1366     s6 = t[5+idt];s7 = t[6+idt];
1367     while (nz--) {
1368       idx   = 7*(*vi++);
1369       x1    = t[idx];   x2 = t[1+idx];
1370       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1371       x6    = t[5+idx]; x7 = t[6+idx];
1372       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1373       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1374       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1375       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1376       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1377       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1378       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1379       v += 49;
1380     }
      /* v is reset to the block at diag[i]; its product with s is stored both in t
         (needed by earlier rows) and in x at the permuted position taken from *c */
1381     idc = 7*(*c--);
1382     v   = aa + 49*diag[i];
1383     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1384                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1385     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1386                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1387     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1388                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1389     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1390                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1391     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1392                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1393     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1394                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1395     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1396                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1397   }
1398 
1399   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1400   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1401   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1402   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1403   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1404   PetscFunctionReturn(0);
1405 }
1406 
/*
   MatSolve_SeqBAIJ_7_newdatastruct - forward and backward triangular sweeps for 7x7
   blocks, reading the factor through a different index layout than
   MatSolve_SeqBAIJ_7:

     - the forward sweep uses every entry ai[i] .. ai[i+1]-1 of row i;
     - the backward sweep uses entries adiag[i+1]+1 .. adiag[i]-1, and the block at
       position adiag[i] is the one applied last (so adiag decreases with i and
       adiag[n] is read when i = n-1).

   Input:
     A  - matrix; A->data is read as Mat_SeqBAIJ (a, i, j, diag, row, col, solve_work)
     bb - right-hand side vector
   Output:
     xx - solution vector

   Permutations are applied by direct indexing: r[i] when loading b, c[i] when
   storing x.

   Returns 0; the result of every PETSc call is passed through CHKERRQ.
*/
1407 #undef __FUNCT__
1408 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1409 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1410 {
1411   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1412   IS             iscol=a->col,isrow=a->row;
1413   PetscErrorCode ierr;
1414   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
1415   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
1416   MatScalar      *aa=a->a,*v;
1417   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1418   PetscScalar    *x,*b,*t;
1419 
1420   PetscFunctionBegin;
1421   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1422   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1423   t  = a->solve_work;
1424 
1425   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1426   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1427 
1428   /* forward solve the lower triangular */
    /* block row 0 is copied straight from b.
       NOTE(review): r[0] and b are read here even when n == 0 - confirm callers
       never reach this routine with an empty matrix. */
1429   idx    = 7*r[0];
1430   t[0] = b[idx];   t[1] = b[1+idx];
1431   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1432   t[5] = b[5+idx]; t[6] = b[6+idx];
1433 
1434   for (i=1; i<n; i++) {
1435     v     = aa + 49*ai[i];
1436     vi    = aj + ai[i];
1437     nz    = ai[i+1] - ai[i];
1438     idx   = 7*r[i];
1439     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1440     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1441     for(m=0;m<nz;m++){
1442       idx   = 7*vi[m];
1443       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1444       x4    = t[3+idx];x5 = t[4+idx];
1445       x6    = t[5+idx];x7 = t[6+idx];
1446       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1447       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1448       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1449       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1450       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1451       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1452       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1453       v += 49;
1454     }
1455     idx = 7*i;
1456     t[idx]   = s1;t[1+idx] = s2;
1457     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1458     t[5+idx] = s6;t[6+idx] = s7;
1459   }
1460   /* backward solve the upper triangular */
1461   for (i=n-1; i>=0; i--){
1462     v    = aa + 49*(adiag[i+1]+1);
1463     vi   = aj + adiag[i+1]+1;
1464     nz   = adiag[i] - adiag[i+1] - 1;
1465     idt  = 7*i;
1466     s1 = t[idt];  s2 = t[1+idt];
1467     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1468     s6 = t[5+idt];s7 = t[6+idt];
1469     for(m=0;m<nz;m++){
1470       idx   = 7*vi[m];
1471       x1    = t[idx];   x2 = t[1+idx];
1472       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1473       x6    = t[5+idx]; x7 = t[6+idx];
1474       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1475       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1476       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1477       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1478       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1479       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1480       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1481       v += 49;
1482     }
      /* v is NOT reset here: after nz steps of 49 from position adiag[i+1]+1 it
         points at the block at position adiag[i], which is the one applied below */
1483     idc = 7*c[i];
1484     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1485                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1486     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1487                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1488     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1489                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1490     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1491                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1492     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1493                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1494     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1495                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1496     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1497                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1498   }
1499 
1500   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1501   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1502   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1503   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1504   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1505   PetscFunctionReturn(0);
1506 }
1507 
1508 #undef __FUNCT__
1509 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1510 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1511 {
1512   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1513   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1514   PetscErrorCode    ierr;
1515   PetscInt          *diag = a->diag,jdx;
1516   const MatScalar   *aa=a->a,*v;
1517   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1518   const PetscScalar *b;
1519 
1520   PetscFunctionBegin;
1521   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1522   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1523   /* forward solve the lower triangular */
1524   idx    = 0;
1525   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1526   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1527   x[6] = b[6+idx];
1528   for (i=1; i<n; i++) {
1529     v     =  aa + 49*ai[i];
1530     vi    =  aj + ai[i];
1531     nz    =  diag[i] - ai[i];
1532     idx   =  7*i;
1533     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1534     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1535     s7  =  b[6+idx];
1536     while (nz--) {
1537       jdx   = 7*(*vi++);
1538       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1539       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1540       x7    = x[6+jdx];
1541       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1542       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1543       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1544       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1545       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1546       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1547       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1548       v += 49;
1549      }
1550     x[idx]   = s1;
1551     x[1+idx] = s2;
1552     x[2+idx] = s3;
1553     x[3+idx] = s4;
1554     x[4+idx] = s5;
1555     x[5+idx] = s6;
1556     x[6+idx] = s7;
1557   }
1558   /* backward solve the upper triangular */
1559   for (i=n-1; i>=0; i--){
1560     v    = aa + 49*diag[i] + 49;
1561     vi   = aj + diag[i] + 1;
1562     nz   = ai[i+1] - diag[i] - 1;
1563     idt  = 7*i;
1564     s1 = x[idt];   s2 = x[1+idt];
1565     s3 = x[2+idt]; s4 = x[3+idt];
1566     s5 = x[4+idt]; s6 = x[5+idt];
1567     s7 = x[6+idt];
1568     while (nz--) {
1569       idx   = 7*(*vi++);
1570       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1571       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1572       x7    = x[6+idx];
1573       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1574       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1575       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1576       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1577       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1578       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1579       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1580       v += 49;
1581     }
1582     v        = aa + 49*diag[i];
1583     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1584                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1585     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1586                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1587     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1588                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1589     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1590                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1591     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1592                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1593     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1594                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1595     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1596                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1597   }
1598 
1599   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1600   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1601   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1602   PetscFunctionReturn(0);
1603 }
1604 
1605 #undef __FUNCT__
1606 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1607 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1608 {
1609     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1610     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
1611     PetscErrorCode    ierr;
1612     PetscInt          idx,jdx,idt;
1613     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1614     const MatScalar   *aa=a->a,*v;
1615     PetscScalar       *x;
1616     const PetscScalar *b;
1617     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1618 
1619     PetscFunctionBegin;
1620     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1621     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1622     /* forward solve the lower triangular */
1623     idx    = 0;
1624     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1625     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1626     for (i=1; i<n; i++) {
1627        v    = aa + bs2*ai[i];
1628        vi   = aj + ai[i];
1629        nz   = ai[i+1] - ai[i];
1630       idx   = bs*i;
1631        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1632        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1633        for(k=0;k<nz;k++) {
1634           jdx   = bs*vi[k];
1635           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1636 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1637           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1638           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1639           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1640 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1641           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1642 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1643 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1644           v   +=  bs2;
1645         }
1646 
1647        x[idx]   = s1;
1648        x[1+idx] = s2;
1649        x[2+idx] = s3;
1650        x[3+idx] = s4;
1651        x[4+idx] = s5;
1652        x[5+idx] = s6;
1653        x[6+idx] = s7;
1654     }
1655 
1656    /* backward solve the upper triangular */
1657   for (i=n-1; i>=0; i--){
1658     v   = aa + bs2*(adiag[i+1]+1);
1659      vi  = aj + adiag[i+1]+1;
1660      nz  = adiag[i] - adiag[i+1]-1;
1661      idt = bs*i;
1662      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1663      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1664     for(k=0;k<nz;k++) {
1665       idx   = bs*vi[k];
1666        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1667        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1668        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1669        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1670        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1671        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1672        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1673        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1674        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1675         v   +=  bs2;
1676     }
1677     /* x = inv_diagonal*x */
1678     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1679     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1680     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1681     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1682     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1683     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1684     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1685   }
1686 
1687   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1688   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1689   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1690   PetscFunctionReturn(0);
1691 }
1692 
1693 #undef __FUNCT__
1694 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1695 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1696 {
1697   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1698   IS                iscol=a->col,isrow=a->row;
1699   PetscErrorCode    ierr;
1700   const PetscInt    *r,*c,*rout,*cout;
1701   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1702   const MatScalar   *aa=a->a,*v;
1703   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1704   const PetscScalar *b;
1705   PetscFunctionBegin;
1706   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1707   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1708   t  = a->solve_work;
1709 
1710   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1711   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1712 
1713   /* forward solve the lower triangular */
1714   idx    = 6*(*r++);
1715   t[0] = b[idx];   t[1] = b[1+idx];
1716   t[2] = b[2+idx]; t[3] = b[3+idx];
1717   t[4] = b[4+idx]; t[5] = b[5+idx];
1718   for (i=1; i<n; i++) {
1719     v     = aa + 36*ai[i];
1720     vi    = aj + ai[i];
1721     nz    = diag[i] - ai[i];
1722     idx   = 6*(*r++);
1723     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1724     s5  = b[4+idx]; s6 = b[5+idx];
1725     while (nz--) {
1726       idx   = 6*(*vi++);
1727       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1728       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1729       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1730       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1731       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1732       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1733       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1734       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1735       v += 36;
1736     }
1737     idx = 6*i;
1738     t[idx]   = s1;t[1+idx] = s2;
1739     t[2+idx] = s3;t[3+idx] = s4;
1740     t[4+idx] = s5;t[5+idx] = s6;
1741   }
1742   /* backward solve the upper triangular */
1743   for (i=n-1; i>=0; i--){
1744     v    = aa + 36*diag[i] + 36;
1745     vi   = aj + diag[i] + 1;
1746     nz   = ai[i+1] - diag[i] - 1;
1747     idt  = 6*i;
1748     s1 = t[idt];  s2 = t[1+idt];
1749     s3 = t[2+idt];s4 = t[3+idt];
1750     s5 = t[4+idt];s6 = t[5+idt];
1751     while (nz--) {
1752       idx   = 6*(*vi++);
1753       x1    = t[idx];   x2 = t[1+idx];
1754       x3    = t[2+idx]; x4 = t[3+idx];
1755       x5    = t[4+idx]; x6 = t[5+idx];
1756       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1757       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1758       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1759       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1760       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1761       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1762       v += 36;
1763     }
1764     idc = 6*(*c--);
1765     v   = aa + 36*diag[i];
1766     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1767                                  v[18]*s4+v[24]*s5+v[30]*s6;
1768     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1769                                  v[19]*s4+v[25]*s5+v[31]*s6;
1770     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1771                                  v[20]*s4+v[26]*s5+v[32]*s6;
1772     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1773                                  v[21]*s4+v[27]*s5+v[33]*s6;
1774     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1775                                  v[22]*s4+v[28]*s5+v[34]*s6;
1776     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1777                                  v[23]*s4+v[29]*s5+v[35]*s6;
1778   }
1779 
1780   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1781   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1782   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1783   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1784   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1785   PetscFunctionReturn(0);
1786 }
1787 
1788 #undef __FUNCT__
1789 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
1790 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
1791 {
1792   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1793   IS                iscol=a->col,isrow=a->row;
1794   PetscErrorCode    ierr;
1795   const PetscInt    *r,*c,*rout,*cout;
1796   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
1797   const MatScalar   *aa=a->a,*v;
1798   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1799   const PetscScalar *b;
1800   PetscFunctionBegin;
1801   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1802   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1803   t  = a->solve_work;
1804 
1805   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1806   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1807 
1808   /* forward solve the lower triangular */
1809   idx    = 6*r[0];
1810   t[0] = b[idx];   t[1] = b[1+idx];
1811   t[2] = b[2+idx]; t[3] = b[3+idx];
1812   t[4] = b[4+idx]; t[5] = b[5+idx];
1813   for (i=1; i<n; i++) {
1814     v     = aa + 36*ai[i];
1815     vi    = aj + ai[i];
1816     nz    = ai[i+1] - ai[i];
1817     idx   = 6*r[i];
1818     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1819     s5  = b[4+idx]; s6 = b[5+idx];
1820     for(m=0;m<nz;m++){
1821       idx   = 6*vi[m];
1822       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1823       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1824       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1825       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1826       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1827       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1828       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1829       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1830       v += 36;
1831     }
1832     idx = 6*i;
1833     t[idx]   = s1;t[1+idx] = s2;
1834     t[2+idx] = s3;t[3+idx] = s4;
1835     t[4+idx] = s5;t[5+idx] = s6;
1836   }
1837   /* backward solve the upper triangular */
1838   for (i=n-1; i>=0; i--){
1839     v    = aa + 36*(adiag[i+1]+1);
1840     vi   = aj + adiag[i+1]+1;
1841     nz   = adiag[i] - adiag[i+1] - 1;
1842     idt  = 6*i;
1843     s1 = t[idt];  s2 = t[1+idt];
1844     s3 = t[2+idt];s4 = t[3+idt];
1845     s5 = t[4+idt];s6 = t[5+idt];
1846     for(m=0;m<nz;m++){
1847       idx   = 6*vi[m];
1848       x1    = t[idx];   x2 = t[1+idx];
1849       x3    = t[2+idx]; x4 = t[3+idx];
1850       x5    = t[4+idx]; x6 = t[5+idx];
1851       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1852       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1853       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1854       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1855       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1856       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1857       v += 36;
1858     }
1859     idc = 6*c[i];
1860     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1861                                  v[18]*s4+v[24]*s5+v[30]*s6;
1862     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1863                                  v[19]*s4+v[25]*s5+v[31]*s6;
1864     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1865                                  v[20]*s4+v[26]*s5+v[32]*s6;
1866     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1867                                  v[21]*s4+v[27]*s5+v[33]*s6;
1868     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1869                                  v[22]*s4+v[28]*s5+v[34]*s6;
1870     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1871                                  v[23]*s4+v[29]*s5+v[35]*s6;
1872   }
1873 
1874   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1876   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1877   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1878   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1879   PetscFunctionReturn(0);
1880 }
1881 
1882 #undef __FUNCT__
1883 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1884 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1885 {
1886   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1887   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1888   PetscErrorCode    ierr;
1889   PetscInt          *diag = a->diag,jdx;
1890   const MatScalar   *aa=a->a,*v;
1891   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1892   const PetscScalar *b;
1893 
1894   PetscFunctionBegin;
1895   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1896   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1897   /* forward solve the lower triangular */
1898   idx    = 0;
1899   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1900   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1901   for (i=1; i<n; i++) {
1902     v     =  aa + 36*ai[i];
1903     vi    =  aj + ai[i];
1904     nz    =  diag[i] - ai[i];
1905     idx   =  6*i;
1906     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1907     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1908     while (nz--) {
1909       jdx   = 6*(*vi++);
1910       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1911       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1912       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1913       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1914       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1915       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1916       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1917       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1918       v += 36;
1919      }
1920     x[idx]   = s1;
1921     x[1+idx] = s2;
1922     x[2+idx] = s3;
1923     x[3+idx] = s4;
1924     x[4+idx] = s5;
1925     x[5+idx] = s6;
1926   }
1927   /* backward solve the upper triangular */
1928   for (i=n-1; i>=0; i--){
1929     v    = aa + 36*diag[i] + 36;
1930     vi   = aj + diag[i] + 1;
1931     nz   = ai[i+1] - diag[i] - 1;
1932     idt  = 6*i;
1933     s1 = x[idt];   s2 = x[1+idt];
1934     s3 = x[2+idt]; s4 = x[3+idt];
1935     s5 = x[4+idt]; s6 = x[5+idt];
1936     while (nz--) {
1937       idx   = 6*(*vi++);
1938       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1939       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1940       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1941       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1942       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1943       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1944       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1945       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1946       v += 36;
1947     }
1948     v        = aa + 36*diag[i];
1949     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1950     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1951     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1952     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1953     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1954     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1955   }
1956 
1957   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1958   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1959   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1960   PetscFunctionReturn(0);
1961 }
1962 
1963 #undef __FUNCT__
1964 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1965 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1966 {
1967     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1968     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
1969     PetscErrorCode    ierr;
1970     PetscInt          idx,jdx,idt;
1971     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1972     const MatScalar   *aa=a->a,*v;
1973     PetscScalar       *x;
1974     const PetscScalar *b;
1975     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1976 
1977     PetscFunctionBegin;
1978     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1979     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1980     /* forward solve the lower triangular */
1981     idx    = 0;
1982     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1983     x[4] = b[4+idx];x[5] = b[5+idx];
1984     for (i=1; i<n; i++) {
1985        v    = aa + bs2*ai[i];
1986        vi   = aj + ai[i];
1987        nz   = ai[i+1] - ai[i];
1988       idx   = bs*i;
1989        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1990        s5   = b[4+idx];s6 = b[5+idx];
1991        for(k=0;k<nz;k++){
1992           jdx   = bs*vi[k];
1993           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1994 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1995           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1996           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1997           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1998 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1999           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2000 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2001           v   +=  bs2;
2002         }
2003 
2004        x[idx]   = s1;
2005        x[1+idx] = s2;
2006        x[2+idx] = s3;
2007        x[3+idx] = s4;
2008        x[4+idx] = s5;
2009        x[5+idx] = s6;
2010     }
2011 
2012    /* backward solve the upper triangular */
2013   for (i=n-1; i>=0; i--){
2014     v   = aa + bs2*(adiag[i+1]+1);
2015      vi  = aj + adiag[i+1]+1;
2016      nz  = adiag[i] - adiag[i+1]-1;
2017      idt = bs*i;
2018      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2019      s5 = x[4+idt];s6 = x[5+idt];
2020      for(k=0;k<nz;k++){
2021       idx   = bs*vi[k];
2022        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2023        x5    = x[4+idx];x6 = x[5+idx];
2024        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2025        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2026        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2027        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2028        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2029        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2030         v   +=  bs2;
2031     }
2032     /* x = inv_diagonal*x */
2033    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2034    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2035    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2036    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2037    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2038    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2039   }
2040 
2041   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2042   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2043   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2044   PetscFunctionReturn(0);
2045 }
2046 
2047 #undef __FUNCT__
2048 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2049 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
2050 {
2051   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2052   IS                iscol=a->col,isrow=a->row;
2053   PetscErrorCode    ierr;
2054   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
2055   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2056   const MatScalar   *aa=a->a,*v;
2057   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2058   const PetscScalar *b;
2059 
2060   PetscFunctionBegin;
2061   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2062   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2063   t  = a->solve_work;
2064 
2065   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2066   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2067 
2068   /* forward solve the lower triangular */
2069   idx    = 5*(*r++);
2070   t[0] = b[idx];   t[1] = b[1+idx];
2071   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2072   for (i=1; i<n; i++) {
2073     v     = aa + 25*ai[i];
2074     vi    = aj + ai[i];
2075     nz    = diag[i] - ai[i];
2076     idx   = 5*(*r++);
2077     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2078     s5  = b[4+idx];
2079     while (nz--) {
2080       idx   = 5*(*vi++);
2081       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2082       x4    = t[3+idx];x5 = t[4+idx];
2083       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2084       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2085       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2086       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2087       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2088       v += 25;
2089     }
2090     idx = 5*i;
2091     t[idx]   = s1;t[1+idx] = s2;
2092     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2093   }
2094   /* backward solve the upper triangular */
2095   for (i=n-1; i>=0; i--){
2096     v    = aa + 25*diag[i] + 25;
2097     vi   = aj + diag[i] + 1;
2098     nz   = ai[i+1] - diag[i] - 1;
2099     idt  = 5*i;
2100     s1 = t[idt];  s2 = t[1+idt];
2101     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2102     while (nz--) {
2103       idx   = 5*(*vi++);
2104       x1    = t[idx];   x2 = t[1+idx];
2105       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2106       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2107       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2108       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2109       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2110       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2111       v += 25;
2112     }
2113     idc = 5*(*c--);
2114     v   = aa + 25*diag[i];
2115     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2116                                  v[15]*s4+v[20]*s5;
2117     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2118                                  v[16]*s4+v[21]*s5;
2119     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2120                                  v[17]*s4+v[22]*s5;
2121     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2122                                  v[18]*s4+v[23]*s5;
2123     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2124                                  v[19]*s4+v[24]*s5;
2125   }
2126 
2127   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2128   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2129   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2130   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2131   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2132   PetscFunctionReturn(0);
2133 }
2134 
2135 #undef __FUNCT__
2136 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2137 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
2138 {
2139   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2140   IS                iscol=a->col,isrow=a->row;
2141   PetscErrorCode    ierr;
2142   const PetscInt    *r,*c,*rout,*cout;
2143   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2144   const MatScalar   *aa=a->a,*v;
2145   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2146   const PetscScalar *b;
2147 
2148   PetscFunctionBegin;
2149   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2150   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2151   t  = a->solve_work;
2152 
2153   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2154   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2155 
2156   /* forward solve the lower triangular */
2157   idx    = 5*r[0];
2158   t[0] = b[idx];   t[1] = b[1+idx];
2159   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2160   for (i=1; i<n; i++) {
2161     v     = aa + 25*ai[i];
2162     vi    = aj + ai[i];
2163     nz    = ai[i+1] - ai[i];
2164     idx   = 5*r[i];
2165     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2166     s5  = b[4+idx];
2167     for(m=0;m<nz;m++){
2168       idx   = 5*vi[m];
2169       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2170       x4    = t[3+idx];x5 = t[4+idx];
2171       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2172       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2173       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2174       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2175       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2176       v += 25;
2177     }
2178     idx = 5*i;
2179     t[idx]   = s1;t[1+idx] = s2;
2180     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2181   }
2182   /* backward solve the upper triangular */
2183   for (i=n-1; i>=0; i--){
2184     v    = aa + 25*(adiag[i+1]+1);
2185     vi   = aj + adiag[i+1]+1;
2186     nz   = adiag[i] - adiag[i+1] - 1;
2187     idt  = 5*i;
2188     s1 = t[idt];  s2 = t[1+idt];
2189     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2190     for(m=0;m<nz;m++){
2191       idx   = 5*vi[m];
2192       x1    = t[idx];   x2 = t[1+idx];
2193       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2194       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2195       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2196       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2197       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2198       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2199       v += 25;
2200     }
2201     idc = 5*c[i];
2202     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2203                                  v[15]*s4+v[20]*s5;
2204     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2205                                  v[16]*s4+v[21]*s5;
2206     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2207                                  v[17]*s4+v[22]*s5;
2208     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2209                                  v[18]*s4+v[23]*s5;
2210     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2211                                  v[19]*s4+v[24]*s5;
2212   }
2213 
2214   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2215   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2216   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2217   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2218   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2219   PetscFunctionReturn(0);
2220 }
2221 
2222 #undef __FUNCT__
2223 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2224 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
2225 {
2226   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2227   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2228   PetscErrorCode    ierr;
2229   PetscInt          *diag = a->diag,jdx;
2230   const MatScalar   *aa=a->a,*v;
2231   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2232   const PetscScalar *b;
2233 
2234   PetscFunctionBegin;
2235   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2236   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2237   /* forward solve the lower triangular */
2238   idx    = 0;
2239   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2240   for (i=1; i<n; i++) {
2241     v     =  aa + 25*ai[i];
2242     vi    =  aj + ai[i];
2243     nz    =  diag[i] - ai[i];
2244     idx   =  5*i;
2245     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2246     while (nz--) {
2247       jdx   = 5*(*vi++);
2248       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2249       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2250       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2251       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2252       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2253       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2254       v    += 25;
2255     }
2256     x[idx]   = s1;
2257     x[1+idx] = s2;
2258     x[2+idx] = s3;
2259     x[3+idx] = s4;
2260     x[4+idx] = s5;
2261   }
2262   /* backward solve the upper triangular */
2263   for (i=n-1; i>=0; i--){
2264     v    = aa + 25*diag[i] + 25;
2265     vi   = aj + diag[i] + 1;
2266     nz   = ai[i+1] - diag[i] - 1;
2267     idt  = 5*i;
2268     s1 = x[idt];  s2 = x[1+idt];
2269     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2270     while (nz--) {
2271       idx   = 5*(*vi++);
2272       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2273       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2274       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2275       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2276       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2277       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2278       v    += 25;
2279     }
2280     v        = aa + 25*diag[i];
2281     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2282     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2283     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2284     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2285     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2286   }
2287 
2288   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2289   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2290   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2291   PetscFunctionReturn(0);
2292 }
2293 
2294 #undef __FUNCT__
2295 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2296 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2297 {
2298   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2299   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
2300   PetscErrorCode    ierr;
2301   PetscInt          jdx;
2302   const MatScalar   *aa=a->a,*v;
2303   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2304   const PetscScalar *b;
2305 
2306   PetscFunctionBegin;
2307   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2308   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2309   /* forward solve the lower triangular */
2310   idx    = 0;
2311   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2312   for (i=1; i<n; i++) {
2313     v   = aa + 25*ai[i];
2314     vi  = aj + ai[i];
2315     nz  = ai[i+1] - ai[i];
2316     idx = 5*i;
2317     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2318     for(k=0;k<nz;k++) {
2319       jdx   = 5*vi[k];
2320       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2321       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2322       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2323       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2324       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2325       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2326       v    += 25;
2327     }
2328     x[idx]   = s1;
2329     x[1+idx] = s2;
2330     x[2+idx] = s3;
2331     x[3+idx] = s4;
2332     x[4+idx] = s5;
2333   }
2334 
2335   /* backward solve the upper triangular */
2336   for (i=n-1; i>=0; i--){
2337     v   = aa + 25*(adiag[i+1]+1);
2338     vi  = aj + adiag[i+1]+1;
2339     nz  = adiag[i] - adiag[i+1]-1;
2340     idt = 5*i;
2341     s1 = x[idt];  s2 = x[1+idt];
2342     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2343     for(k=0;k<nz;k++){
2344       idx   = 5*vi[k];
2345       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2346       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2347       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2348       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2349       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2350       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2351       v    += 25;
2352     }
2353     /* x = inv_diagonal*x */
2354     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2355     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2356     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2357     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2358     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2359   }
2360 
2361   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2362   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2363   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2364   PetscFunctionReturn(0);
2365 }
2366 
2367 #undef __FUNCT__
2368 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2369 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2370 {
2371   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2372   IS                iscol=a->col,isrow=a->row;
2373   PetscErrorCode    ierr;
2374   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2375   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2376   const MatScalar   *aa=a->a,*v;
2377   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2378   const PetscScalar *b;
2379 
2380   PetscFunctionBegin;
2381   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2382   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2383   t  = a->solve_work;
2384 
2385   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2386   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2387 
2388   /* forward solve the lower triangular */
2389   idx    = 4*(*r++);
2390   t[0] = b[idx];   t[1] = b[1+idx];
2391   t[2] = b[2+idx]; t[3] = b[3+idx];
2392   for (i=1; i<n; i++) {
2393     v     = aa + 16*ai[i];
2394     vi    = aj + ai[i];
2395     nz    = diag[i] - ai[i];
2396     idx   = 4*(*r++);
2397     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2398     while (nz--) {
2399       idx   = 4*(*vi++);
2400       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2401       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2402       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2403       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2404       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2405       v    += 16;
2406     }
2407     idx        = 4*i;
2408     t[idx]   = s1;t[1+idx] = s2;
2409     t[2+idx] = s3;t[3+idx] = s4;
2410   }
2411   /* backward solve the upper triangular */
2412   for (i=n-1; i>=0; i--){
2413     v    = aa + 16*diag[i] + 16;
2414     vi   = aj + diag[i] + 1;
2415     nz   = ai[i+1] - diag[i] - 1;
2416     idt  = 4*i;
2417     s1 = t[idt];  s2 = t[1+idt];
2418     s3 = t[2+idt];s4 = t[3+idt];
2419     while (nz--) {
2420       idx   = 4*(*vi++);
2421       x1    = t[idx];   x2 = t[1+idx];
2422       x3    = t[2+idx]; x4 = t[3+idx];
2423       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2424       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2425       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2426       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2427       v += 16;
2428     }
2429     idc      = 4*(*c--);
2430     v        = aa + 16*diag[i];
2431     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2432     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2433     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2434     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2435   }
2436 
2437   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2438   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2439   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2440   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2441   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2442   PetscFunctionReturn(0);
2443 }
2444 
2445 #undef __FUNCT__
2446 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2447 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
2448 {
2449   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2450   IS                iscol=a->col,isrow=a->row;
2451   PetscErrorCode    ierr;
2452   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2453   const PetscInt    *r,*c,*rout,*cout;
2454   const MatScalar   *aa=a->a,*v;
2455   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2456   const PetscScalar *b;
2457 
2458   PetscFunctionBegin;
2459   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2460   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2461   t  = a->solve_work;
2462 
2463   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2464   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2465 
2466   /* forward solve the lower triangular */
2467   idx    = 4*r[0];
2468   t[0] = b[idx];   t[1] = b[1+idx];
2469   t[2] = b[2+idx]; t[3] = b[3+idx];
2470   for (i=1; i<n; i++) {
2471     v     = aa + 16*ai[i];
2472     vi    = aj + ai[i];
2473     nz    = ai[i+1] - ai[i];
2474     idx   = 4*r[i];
2475     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2476     for(m=0;m<nz;m++){
2477       idx   = 4*vi[m];
2478       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2479       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2480       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2481       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2482       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2483       v    += 16;
2484     }
2485     idx        = 4*i;
2486     t[idx]   = s1;t[1+idx] = s2;
2487     t[2+idx] = s3;t[3+idx] = s4;
2488   }
2489   /* backward solve the upper triangular */
2490   for (i=n-1; i>=0; i--){
2491     v    = aa + 16*(adiag[i+1]+1);
2492     vi   = aj + adiag[i+1]+1;
2493     nz   = adiag[i] - adiag[i+1] - 1;
2494     idt  = 4*i;
2495     s1 = t[idt];  s2 = t[1+idt];
2496     s3 = t[2+idt];s4 = t[3+idt];
2497     for(m=0;m<nz;m++){
2498       idx   = 4*vi[m];
2499       x1    = t[idx];   x2 = t[1+idx];
2500       x3    = t[2+idx]; x4 = t[3+idx];
2501       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2502       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2503       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2504       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2505       v += 16;
2506     }
2507     idc      = 4*c[i];
2508     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2509     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2510     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2511     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2512   }
2513 
2514   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2515   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2516   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2517   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2518   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2519   PetscFunctionReturn(0);
2520 }
2521 
2522 #undef __FUNCT__
2523 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2524 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2525 {
2526   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2527   IS                iscol=a->col,isrow=a->row;
2528   PetscErrorCode    ierr;
2529   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2530   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2531   const MatScalar   *aa=a->a,*v;
2532   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2533   PetscScalar       *x;
2534   const PetscScalar *b;
2535 
2536   PetscFunctionBegin;
2537   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2538   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2539   t  = (MatScalar *)a->solve_work;
2540 
2541   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2542   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2543 
2544   /* forward solve the lower triangular */
2545   idx    = 4*(*r++);
2546   t[0] = (MatScalar)b[idx];
2547   t[1] = (MatScalar)b[1+idx];
2548   t[2] = (MatScalar)b[2+idx];
2549   t[3] = (MatScalar)b[3+idx];
2550   for (i=1; i<n; i++) {
2551     v     = aa + 16*ai[i];
2552     vi    = aj + ai[i];
2553     nz    = diag[i] - ai[i];
2554     idx   = 4*(*r++);
2555     s1 = (MatScalar)b[idx];
2556     s2 = (MatScalar)b[1+idx];
2557     s3 = (MatScalar)b[2+idx];
2558     s4 = (MatScalar)b[3+idx];
2559     while (nz--) {
2560       idx   = 4*(*vi++);
2561       x1  = t[idx];
2562       x2  = t[1+idx];
2563       x3  = t[2+idx];
2564       x4  = t[3+idx];
2565       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2566       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2567       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2568       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2569       v    += 16;
2570     }
2571     idx        = 4*i;
2572     t[idx]   = s1;
2573     t[1+idx] = s2;
2574     t[2+idx] = s3;
2575     t[3+idx] = s4;
2576   }
2577   /* backward solve the upper triangular */
2578   for (i=n-1; i>=0; i--){
2579     v    = aa + 16*diag[i] + 16;
2580     vi   = aj + diag[i] + 1;
2581     nz   = ai[i+1] - diag[i] - 1;
2582     idt  = 4*i;
2583     s1 = t[idt];
2584     s2 = t[1+idt];
2585     s3 = t[2+idt];
2586     s4 = t[3+idt];
2587     while (nz--) {
2588       idx   = 4*(*vi++);
2589       x1  = t[idx];
2590       x2  = t[1+idx];
2591       x3  = t[2+idx];
2592       x4  = t[3+idx];
2593       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2594       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2595       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2596       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2597       v += 16;
2598     }
2599     idc      = 4*(*c--);
2600     v        = aa + 16*diag[i];
2601     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2602     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2603     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2604     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2605     x[idc]   = (PetscScalar)t[idt];
2606     x[1+idc] = (PetscScalar)t[1+idt];
2607     x[2+idc] = (PetscScalar)t[2+idt];
2608     x[3+idc] = (PetscScalar)t[3+idt];
2609  }
2610 
2611   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2612   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2613   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2614   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2615   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2616   PetscFunctionReturn(0);
2617 }
2618 
2619 #if defined (PETSC_HAVE_SSE)
2620 
2621 #include PETSC_HAVE_SSE
2622 
2623 #undef __FUNCT__
2624 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2625 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
2626 {
2627   /*
2628      Note: This code uses demotion of double
2629      to float when performing the mixed-mode computation.
2630      This may not be numerically reasonable for all applications.
2631   */
2632   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2633   IS             iscol=a->col,isrow=a->row;
2634   PetscErrorCode ierr;
2635   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
2636   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2637   MatScalar      *aa=a->a,*v;
2638   PetscScalar    *x,*b,*t;
2639 
2640   /* Make space in temp stack for 16 Byte Aligned arrays */
2641   float           ssealignedspace[11],*tmps,*tmpx;
2642   unsigned long   offset;
2643 
2644   PetscFunctionBegin;
2645   SSE_SCOPE_BEGIN;
2646 
2647     offset = (unsigned long)ssealignedspace % 16;
2648     if (offset) offset = (16 - offset)/4;
2649     tmps = &ssealignedspace[offset];
2650     tmpx = &ssealignedspace[offset+4];
2651     PREFETCH_NTA(aa+16*ai[1]);
2652 
2653     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2654     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2655     t  = a->solve_work;
2656 
2657     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2658     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2659 
2660     /* forward solve the lower triangular */
2661     idx  = 4*(*r++);
2662     t[0] = b[idx];   t[1] = b[1+idx];
2663     t[2] = b[2+idx]; t[3] = b[3+idx];
2664     v    =  aa + 16*ai[1];
2665 
2666     for (i=1; i<n;) {
2667       PREFETCH_NTA(&v[8]);
2668       vi   =  aj      + ai[i];
2669       nz   =  diag[i] - ai[i];
2670       idx  =  4*(*r++);
2671 
2672       /* Demote sum from double to float */
2673       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2674       LOAD_PS(tmps,XMM7);
2675 
2676       while (nz--) {
2677         PREFETCH_NTA(&v[16]);
2678         idx = 4*(*vi++);
2679 
2680         /* Demote solution (so far) from double to float */
2681         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2682 
2683         /* 4x4 Matrix-Vector product with negative accumulation: */
2684         SSE_INLINE_BEGIN_2(tmpx,v)
2685           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2686 
2687           /* First Column */
2688           SSE_COPY_PS(XMM0,XMM6)
2689           SSE_SHUFFLE(XMM0,XMM0,0x00)
2690           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2691           SSE_SUB_PS(XMM7,XMM0)
2692 
2693           /* Second Column */
2694           SSE_COPY_PS(XMM1,XMM6)
2695           SSE_SHUFFLE(XMM1,XMM1,0x55)
2696           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2697           SSE_SUB_PS(XMM7,XMM1)
2698 
2699           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2700 
2701           /* Third Column */
2702           SSE_COPY_PS(XMM2,XMM6)
2703           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2704           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2705           SSE_SUB_PS(XMM7,XMM2)
2706 
2707           /* Fourth Column */
2708           SSE_COPY_PS(XMM3,XMM6)
2709           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2710           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2711           SSE_SUB_PS(XMM7,XMM3)
2712         SSE_INLINE_END_2
2713 
2714         v  += 16;
2715       }
2716       idx = 4*i;
2717       v   = aa + 16*ai[++i];
2718       PREFETCH_NTA(v);
2719       STORE_PS(tmps,XMM7);
2720 
2721       /* Promote result from float to double */
2722       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
2723     }
2724     /* backward solve the upper triangular */
2725     idt  = 4*(n-1);
2726     ai16 = 16*diag[n-1];
2727     v    = aa + ai16 + 16;
2728     for (i=n-1; i>=0;){
2729       PREFETCH_NTA(&v[8]);
2730       vi = aj + diag[i] + 1;
2731       nz = ai[i+1] - diag[i] - 1;
2732 
2733       /* Demote accumulator from double to float */
2734       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2735       LOAD_PS(tmps,XMM7);
2736 
2737       while (nz--) {
2738         PREFETCH_NTA(&v[16]);
2739         idx = 4*(*vi++);
2740 
2741         /* Demote solution (so far) from double to float */
2742         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2743 
2744         /* 4x4 Matrix-Vector Product with negative accumulation: */
2745         SSE_INLINE_BEGIN_2(tmpx,v)
2746           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2747 
2748           /* First Column */
2749           SSE_COPY_PS(XMM0,XMM6)
2750           SSE_SHUFFLE(XMM0,XMM0,0x00)
2751           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2752           SSE_SUB_PS(XMM7,XMM0)
2753 
2754           /* Second Column */
2755           SSE_COPY_PS(XMM1,XMM6)
2756           SSE_SHUFFLE(XMM1,XMM1,0x55)
2757           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2758           SSE_SUB_PS(XMM7,XMM1)
2759 
2760           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2761 
2762           /* Third Column */
2763           SSE_COPY_PS(XMM2,XMM6)
2764           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2765           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2766           SSE_SUB_PS(XMM7,XMM2)
2767 
2768           /* Fourth Column */
2769           SSE_COPY_PS(XMM3,XMM6)
2770           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2771           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2772           SSE_SUB_PS(XMM7,XMM3)
2773         SSE_INLINE_END_2
2774         v  += 16;
2775       }
2776       v    = aa + ai16;
2777       ai16 = 16*diag[--i];
2778       PREFETCH_NTA(aa+ai16+16);
2779       /*
2780          Scale the result by the diagonal 4x4 block,
2781          which was inverted as part of the factorization
2782       */
2783       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2784         /* First Column */
2785         SSE_COPY_PS(XMM0,XMM7)
2786         SSE_SHUFFLE(XMM0,XMM0,0x00)
2787         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2788 
2789         /* Second Column */
2790         SSE_COPY_PS(XMM1,XMM7)
2791         SSE_SHUFFLE(XMM1,XMM1,0x55)
2792         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2793         SSE_ADD_PS(XMM0,XMM1)
2794 
2795         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2796 
2797         /* Third Column */
2798         SSE_COPY_PS(XMM2,XMM7)
2799         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2800         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2801         SSE_ADD_PS(XMM0,XMM2)
2802 
2803         /* Fourth Column */
2804         SSE_COPY_PS(XMM3,XMM7)
2805         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2806         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2807         SSE_ADD_PS(XMM0,XMM3)
2808 
2809         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2810       SSE_INLINE_END_3
2811 
2812       /* Promote solution from float to double */
2813       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2814 
2815       /* Apply reordering to t and stream into x.    */
2816       /* This way, x doesn't pollute the cache.      */
2817       /* Be careful with size: 2 doubles = 4 floats! */
2818       idc  = 4*(*c--);
2819       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2820         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2821         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2822         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2823         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2824         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2825         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2826       SSE_INLINE_END_2
2827       v    = aa + ai16 + 16;
2828       idt -= 4;
2829     }
2830 
2831     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2832     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2833     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2834     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2835     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2836   SSE_SCOPE_END;
2837   PetscFunctionReturn(0);
2838 }
2839 
2840 #endif
2841 
2842 
2843 /*
2844       Special case where the matrix was ILU(0) factored in the natural
2845    ordering. This eliminates the need for the column and row permutation.
2846 */
2847 #undef __FUNCT__
2848 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2849 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2850 {
2851   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2852   PetscInt          n=a->mbs;
2853   const PetscInt    *ai=a->i,*aj=a->j;
2854   PetscErrorCode    ierr;
2855   const PetscInt    *diag = a->diag;
2856   const MatScalar   *aa=a->a;
2857   PetscScalar       *x;
2858   const PetscScalar *b;
2859 
2860   PetscFunctionBegin;
2861   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2862   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2863 
2864 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2865   {
2866     static PetscScalar w[2000]; /* very BAD need to fix */
2867     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2868   }
2869 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2870   {
2871     static PetscScalar w[2000]; /* very BAD need to fix */
2872     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2873   }
2874 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2875   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2876 #else
2877   {
2878     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2879     const MatScalar *v;
2880     PetscInt        jdx,idt,idx,nz,i,ai16;
2881     const PetscInt  *vi;
2882 
2883   /* forward solve the lower triangular */
2884   idx    = 0;
2885   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2886   for (i=1; i<n; i++) {
2887     v     =  aa      + 16*ai[i];
2888     vi    =  aj      + ai[i];
2889     nz    =  diag[i] - ai[i];
2890     idx   +=  4;
2891     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2892     while (nz--) {
2893       jdx   = 4*(*vi++);
2894       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2895       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2896       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2897       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2898       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2899       v    += 16;
2900     }
2901     x[idx]   = s1;
2902     x[1+idx] = s2;
2903     x[2+idx] = s3;
2904     x[3+idx] = s4;
2905   }
2906   /* backward solve the upper triangular */
2907   idt = 4*(n-1);
2908   for (i=n-1; i>=0; i--){
2909     ai16 = 16*diag[i];
2910     v    = aa + ai16 + 16;
2911     vi   = aj + diag[i] + 1;
2912     nz   = ai[i+1] - diag[i] - 1;
2913     s1 = x[idt];  s2 = x[1+idt];
2914     s3 = x[2+idt];s4 = x[3+idt];
2915     while (nz--) {
2916       idx   = 4*(*vi++);
2917       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2918       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2919       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2920       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2921       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2922       v    += 16;
2923     }
2924     v        = aa + ai16;
2925     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2926     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2927     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2928     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2929     idt -= 4;
2930   }
2931   }
2932 #endif
2933 
2934   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2935   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2936   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2937   PetscFunctionReturn(0);
2938 }
2939 
2940 #undef __FUNCT__
2941 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
2942 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2943 {
2944     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2945     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2946     PetscErrorCode    ierr;
2947     PetscInt          idx,jdx,idt;
2948     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2949     const MatScalar   *aa=a->a,*v;
2950     PetscScalar       *x;
2951     const PetscScalar *b;
2952     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2953 
2954     PetscFunctionBegin;
2955     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2956     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2957     /* forward solve the lower triangular */
2958     idx    = 0;
2959     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2960     for (i=1; i<n; i++) {
2961        v    = aa + bs2*ai[i];
2962        vi   = aj + ai[i];
2963        nz   = ai[i+1] - ai[i];
2964       idx   = bs*i;
2965        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2966       for(k=0;k<nz;k++) {
2967           jdx   = bs*vi[k];
2968           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2969           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2970           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2971           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2972 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2973 
2974           v   +=  bs2;
2975         }
2976 
2977        x[idx]   = s1;
2978        x[1+idx] = s2;
2979        x[2+idx] = s3;
2980        x[3+idx] = s4;
2981     }
2982 
2983    /* backward solve the upper triangular */
2984   for (i=n-1; i>=0; i--){
2985     v   = aa + bs2*(adiag[i+1]+1);
2986      vi  = aj + adiag[i+1]+1;
2987      nz  = adiag[i] - adiag[i+1]-1;
2988      idt = bs*i;
2989      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2990 
2991     for(k=0;k<nz;k++){
2992       idx   = bs*vi[k];
2993        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2994        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2995        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2996        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2997        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2998 
2999         v   +=  bs2;
3000     }
3001     /* x = inv_diagonal*x */
3002    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3003    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3004    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3005    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3006 
3007   }
3008 
3009   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3010   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3011   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3012   PetscFunctionReturn(0);
3013 }
3014 
3015 #undef __FUNCT__
3016 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3017 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3018 {
3019   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3020   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3021   PetscErrorCode ierr;
3022   PetscInt       *diag = a->diag;
3023   MatScalar      *aa=a->a;
3024   PetscScalar    *x,*b;
3025 
3026   PetscFunctionBegin;
3027   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3028   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3029 
3030   {
3031     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3032     MatScalar  *v,*t=(MatScalar *)x;
3033     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3034 
3035     /* forward solve the lower triangular */
3036     idx  = 0;
3037     t[0] = (MatScalar)b[0];
3038     t[1] = (MatScalar)b[1];
3039     t[2] = (MatScalar)b[2];
3040     t[3] = (MatScalar)b[3];
3041     for (i=1; i<n; i++) {
3042       v     =  aa      + 16*ai[i];
3043       vi    =  aj      + ai[i];
3044       nz    =  diag[i] - ai[i];
3045       idx   +=  4;
3046       s1 = (MatScalar)b[idx];
3047       s2 = (MatScalar)b[1+idx];
3048       s3 = (MatScalar)b[2+idx];
3049       s4 = (MatScalar)b[3+idx];
3050       while (nz--) {
3051         jdx = 4*(*vi++);
3052         x1  = t[jdx];
3053         x2  = t[1+jdx];
3054         x3  = t[2+jdx];
3055         x4  = t[3+jdx];
3056         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3057         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3058         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3059         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3060         v    += 16;
3061       }
3062       t[idx]   = s1;
3063       t[1+idx] = s2;
3064       t[2+idx] = s3;
3065       t[3+idx] = s4;
3066     }
3067     /* backward solve the upper triangular */
3068     idt = 4*(n-1);
3069     for (i=n-1; i>=0; i--){
3070       ai16 = 16*diag[i];
3071       v    = aa + ai16 + 16;
3072       vi   = aj + diag[i] + 1;
3073       nz   = ai[i+1] - diag[i] - 1;
3074       s1   = t[idt];
3075       s2   = t[1+idt];
3076       s3   = t[2+idt];
3077       s4   = t[3+idt];
3078       while (nz--) {
3079         idx = 4*(*vi++);
3080         x1  = (MatScalar)x[idx];
3081         x2  = (MatScalar)x[1+idx];
3082         x3  = (MatScalar)x[2+idx];
3083         x4  = (MatScalar)x[3+idx];
3084         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3085         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3086         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3087         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3088         v    += 16;
3089       }
3090       v        = aa + ai16;
3091       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3092       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3093       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3094       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3095       idt -= 4;
3096     }
3097   }
3098 
3099   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3100   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3101   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3102   PetscFunctionReturn(0);
3103 }
3104 
3105 #if defined (PETSC_HAVE_SSE)
3106 
3107 #include PETSC_HAVE_SSE
3108 #undef __FUNCT__
3109 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3110 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
3111 {
3112   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3113   unsigned short *aj=(unsigned short *)a->j;
3114   PetscErrorCode ierr;
3115   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3116   MatScalar      *aa=a->a;
3117   PetscScalar    *x,*b;
3118 
3119   PetscFunctionBegin;
3120   SSE_SCOPE_BEGIN;
3121   /*
3122      Note: This code currently uses demotion of double
3123      to float when performing the mixed-mode computation.
3124      This may not be numerically reasonable for all applications.
3125   */
3126   PREFETCH_NTA(aa+16*ai[1]);
3127 
3128   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3130   {
3131     /* x will first be computed in single precision then promoted inplace to double */
3132     MatScalar      *v,*t=(MatScalar *)x;
3133     int            nz,i,idt,ai16;
3134     unsigned int   jdx,idx;
3135     unsigned short *vi;
3136     /* Forward solve the lower triangular factor. */
3137 
3138     /* First block is the identity. */
3139     idx  = 0;
3140     CONVERT_DOUBLE4_FLOAT4(t,b);
3141     v    =  aa + 16*((unsigned int)ai[1]);
3142 
3143     for (i=1; i<n;) {
3144       PREFETCH_NTA(&v[8]);
3145       vi   =  aj      + ai[i];
3146       nz   =  diag[i] - ai[i];
3147       idx +=  4;
3148 
3149       /* Demote RHS from double to float. */
3150       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3151       LOAD_PS(&t[idx],XMM7);
3152 
3153       while (nz--) {
3154         PREFETCH_NTA(&v[16]);
3155         jdx = 4*((unsigned int)(*vi++));
3156 
3157         /* 4x4 Matrix-Vector product with negative accumulation: */
3158         SSE_INLINE_BEGIN_2(&t[jdx],v)
3159           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3160 
3161           /* First Column */
3162           SSE_COPY_PS(XMM0,XMM6)
3163           SSE_SHUFFLE(XMM0,XMM0,0x00)
3164           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3165           SSE_SUB_PS(XMM7,XMM0)
3166 
3167           /* Second Column */
3168           SSE_COPY_PS(XMM1,XMM6)
3169           SSE_SHUFFLE(XMM1,XMM1,0x55)
3170           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3171           SSE_SUB_PS(XMM7,XMM1)
3172 
3173           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3174 
3175           /* Third Column */
3176           SSE_COPY_PS(XMM2,XMM6)
3177           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3178           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3179           SSE_SUB_PS(XMM7,XMM2)
3180 
3181           /* Fourth Column */
3182           SSE_COPY_PS(XMM3,XMM6)
3183           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3184           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3185           SSE_SUB_PS(XMM7,XMM3)
3186         SSE_INLINE_END_2
3187 
3188         v  += 16;
3189       }
3190       v    =  aa + 16*ai[++i];
3191       PREFETCH_NTA(v);
3192       STORE_PS(&t[idx],XMM7);
3193     }
3194 
3195     /* Backward solve the upper triangular factor.*/
3196 
3197     idt  = 4*(n-1);
3198     ai16 = 16*diag[n-1];
3199     v    = aa + ai16 + 16;
3200     for (i=n-1; i>=0;){
3201       PREFETCH_NTA(&v[8]);
3202       vi = aj + diag[i] + 1;
3203       nz = ai[i+1] - diag[i] - 1;
3204 
3205       LOAD_PS(&t[idt],XMM7);
3206 
3207       while (nz--) {
3208         PREFETCH_NTA(&v[16]);
3209         idx = 4*((unsigned int)(*vi++));
3210 
3211         /* 4x4 Matrix-Vector Product with negative accumulation: */
3212         SSE_INLINE_BEGIN_2(&t[idx],v)
3213           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3214 
3215           /* First Column */
3216           SSE_COPY_PS(XMM0,XMM6)
3217           SSE_SHUFFLE(XMM0,XMM0,0x00)
3218           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3219           SSE_SUB_PS(XMM7,XMM0)
3220 
3221           /* Second Column */
3222           SSE_COPY_PS(XMM1,XMM6)
3223           SSE_SHUFFLE(XMM1,XMM1,0x55)
3224           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3225           SSE_SUB_PS(XMM7,XMM1)
3226 
3227           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3228 
3229           /* Third Column */
3230           SSE_COPY_PS(XMM2,XMM6)
3231           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3232           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3233           SSE_SUB_PS(XMM7,XMM2)
3234 
3235           /* Fourth Column */
3236           SSE_COPY_PS(XMM3,XMM6)
3237           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3238           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3239           SSE_SUB_PS(XMM7,XMM3)
3240         SSE_INLINE_END_2
3241         v  += 16;
3242       }
3243       v    = aa + ai16;
3244       ai16 = 16*diag[--i];
3245       PREFETCH_NTA(aa+ai16+16);
3246       /*
3247          Scale the result by the diagonal 4x4 block,
3248          which was inverted as part of the factorization
3249       */
3250       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3251         /* First Column */
3252         SSE_COPY_PS(XMM0,XMM7)
3253         SSE_SHUFFLE(XMM0,XMM0,0x00)
3254         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3255 
3256         /* Second Column */
3257         SSE_COPY_PS(XMM1,XMM7)
3258         SSE_SHUFFLE(XMM1,XMM1,0x55)
3259         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3260         SSE_ADD_PS(XMM0,XMM1)
3261 
3262         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3263 
3264         /* Third Column */
3265         SSE_COPY_PS(XMM2,XMM7)
3266         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3267         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3268         SSE_ADD_PS(XMM0,XMM2)
3269 
3270         /* Fourth Column */
3271         SSE_COPY_PS(XMM3,XMM7)
3272         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3273         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3274         SSE_ADD_PS(XMM0,XMM3)
3275 
3276         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3277       SSE_INLINE_END_3
3278 
3279       v    = aa + ai16 + 16;
3280       idt -= 4;
3281     }
3282 
3283     /* Convert t from single precision back to double precision (inplace)*/
3284     idt = 4*(n-1);
3285     for (i=n-1;i>=0;i--) {
3286       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3287       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3288       PetscScalar *xtemp=&x[idt];
3289       MatScalar   *ttemp=&t[idt];
3290       xtemp[3] = (PetscScalar)ttemp[3];
3291       xtemp[2] = (PetscScalar)ttemp[2];
3292       xtemp[1] = (PetscScalar)ttemp[1];
3293       xtemp[0] = (PetscScalar)ttemp[0];
3294       idt -= 4;
3295     }
3296 
3297   } /* End of artificial scope. */
3298   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3299   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3300   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3301   SSE_SCOPE_END;
3302   PetscFunctionReturn(0);
3303 }
3304 
3305 #undef __FUNCT__
3306 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3307 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
3308 {
3309   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3310   int            *aj=a->j;
3311   PetscErrorCode ierr;
3312   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3313   MatScalar      *aa=a->a;
3314   PetscScalar    *x,*b;
3315 
3316   PetscFunctionBegin;
3317   SSE_SCOPE_BEGIN;
3318   /*
3319      Note: This code currently uses demotion of double
3320      to float when performing the mixed-mode computation.
3321      This may not be numerically reasonable for all applications.
3322   */
3323   PREFETCH_NTA(aa+16*ai[1]);
3324 
3325   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3326   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3327   {
3328     /* x will first be computed in single precision then promoted inplace to double */
3329     MatScalar *v,*t=(MatScalar *)x;
3330     int       nz,i,idt,ai16;
3331     int       jdx,idx;
3332     int       *vi;
3333     /* Forward solve the lower triangular factor. */
3334 
3335     /* First block is the identity. */
3336     idx  = 0;
3337     CONVERT_DOUBLE4_FLOAT4(t,b);
3338     v    =  aa + 16*ai[1];
3339 
3340     for (i=1; i<n;) {
3341       PREFETCH_NTA(&v[8]);
3342       vi   =  aj      + ai[i];
3343       nz   =  diag[i] - ai[i];
3344       idx +=  4;
3345 
3346       /* Demote RHS from double to float. */
3347       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3348       LOAD_PS(&t[idx],XMM7);
3349 
3350       while (nz--) {
3351         PREFETCH_NTA(&v[16]);
3352         jdx = 4*(*vi++);
3353 /*          jdx = *vi++; */
3354 
3355         /* 4x4 Matrix-Vector product with negative accumulation: */
3356         SSE_INLINE_BEGIN_2(&t[jdx],v)
3357           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3358 
3359           /* First Column */
3360           SSE_COPY_PS(XMM0,XMM6)
3361           SSE_SHUFFLE(XMM0,XMM0,0x00)
3362           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3363           SSE_SUB_PS(XMM7,XMM0)
3364 
3365           /* Second Column */
3366           SSE_COPY_PS(XMM1,XMM6)
3367           SSE_SHUFFLE(XMM1,XMM1,0x55)
3368           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3369           SSE_SUB_PS(XMM7,XMM1)
3370 
3371           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3372 
3373           /* Third Column */
3374           SSE_COPY_PS(XMM2,XMM6)
3375           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3376           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3377           SSE_SUB_PS(XMM7,XMM2)
3378 
3379           /* Fourth Column */
3380           SSE_COPY_PS(XMM3,XMM6)
3381           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3382           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3383           SSE_SUB_PS(XMM7,XMM3)
3384         SSE_INLINE_END_2
3385 
3386         v  += 16;
3387       }
3388       v    =  aa + 16*ai[++i];
3389       PREFETCH_NTA(v);
3390       STORE_PS(&t[idx],XMM7);
3391     }
3392 
3393     /* Backward solve the upper triangular factor.*/
3394 
3395     idt  = 4*(n-1);
3396     ai16 = 16*diag[n-1];
3397     v    = aa + ai16 + 16;
3398     for (i=n-1; i>=0;){
3399       PREFETCH_NTA(&v[8]);
3400       vi = aj + diag[i] + 1;
3401       nz = ai[i+1] - diag[i] - 1;
3402 
3403       LOAD_PS(&t[idt],XMM7);
3404 
3405       while (nz--) {
3406         PREFETCH_NTA(&v[16]);
3407         idx = 4*(*vi++);
3408 /*          idx = *vi++; */
3409 
3410         /* 4x4 Matrix-Vector Product with negative accumulation: */
3411         SSE_INLINE_BEGIN_2(&t[idx],v)
3412           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3413 
3414           /* First Column */
3415           SSE_COPY_PS(XMM0,XMM6)
3416           SSE_SHUFFLE(XMM0,XMM0,0x00)
3417           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3418           SSE_SUB_PS(XMM7,XMM0)
3419 
3420           /* Second Column */
3421           SSE_COPY_PS(XMM1,XMM6)
3422           SSE_SHUFFLE(XMM1,XMM1,0x55)
3423           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3424           SSE_SUB_PS(XMM7,XMM1)
3425 
3426           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3427 
3428           /* Third Column */
3429           SSE_COPY_PS(XMM2,XMM6)
3430           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3431           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3432           SSE_SUB_PS(XMM7,XMM2)
3433 
3434           /* Fourth Column */
3435           SSE_COPY_PS(XMM3,XMM6)
3436           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3437           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3438           SSE_SUB_PS(XMM7,XMM3)
3439         SSE_INLINE_END_2
3440         v  += 16;
3441       }
3442       v    = aa + ai16;
3443       ai16 = 16*diag[--i];
3444       PREFETCH_NTA(aa+ai16+16);
3445       /*
3446          Scale the result by the diagonal 4x4 block,
3447          which was inverted as part of the factorization
3448       */
3449       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3450         /* First Column */
3451         SSE_COPY_PS(XMM0,XMM7)
3452         SSE_SHUFFLE(XMM0,XMM0,0x00)
3453         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3454 
3455         /* Second Column */
3456         SSE_COPY_PS(XMM1,XMM7)
3457         SSE_SHUFFLE(XMM1,XMM1,0x55)
3458         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3459         SSE_ADD_PS(XMM0,XMM1)
3460 
3461         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3462 
3463         /* Third Column */
3464         SSE_COPY_PS(XMM2,XMM7)
3465         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3466         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3467         SSE_ADD_PS(XMM0,XMM2)
3468 
3469         /* Fourth Column */
3470         SSE_COPY_PS(XMM3,XMM7)
3471         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3472         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3473         SSE_ADD_PS(XMM0,XMM3)
3474 
3475         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3476       SSE_INLINE_END_3
3477 
3478       v    = aa + ai16 + 16;
3479       idt -= 4;
3480     }
3481 
3482     /* Convert t from single precision back to double precision (inplace)*/
3483     idt = 4*(n-1);
3484     for (i=n-1;i>=0;i--) {
3485       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3486       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3487       PetscScalar *xtemp=&x[idt];
3488       MatScalar   *ttemp=&t[idt];
3489       xtemp[3] = (PetscScalar)ttemp[3];
3490       xtemp[2] = (PetscScalar)ttemp[2];
3491       xtemp[1] = (PetscScalar)ttemp[1];
3492       xtemp[0] = (PetscScalar)ttemp[0];
3493       idt -= 4;
3494     }
3495 
3496   } /* End of artificial scope. */
3497   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3498   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3499   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3500   SSE_SCOPE_END;
3501   PetscFunctionReturn(0);
3502 }
3503 
3504 #endif
3505 
3506 #undef __FUNCT__
3507 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3508 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
3509 {
3510   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3511   IS                iscol=a->col,isrow=a->row;
3512   PetscErrorCode    ierr;
3513   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3514   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3515   const MatScalar   *aa=a->a,*v;
3516   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3517   const PetscScalar *b;
3518 
3519   PetscFunctionBegin;
3520   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3521   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3522   t  = a->solve_work;
3523 
3524   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3525   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3526 
3527   /* forward solve the lower triangular */
3528   idx    = 3*(*r++);
3529   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3530   for (i=1; i<n; i++) {
3531     v     = aa + 9*ai[i];
3532     vi    = aj + ai[i];
3533     nz    = diag[i] - ai[i];
3534     idx   = 3*(*r++);
3535     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3536     while (nz--) {
3537       idx   = 3*(*vi++);
3538       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3539       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3540       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3541       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3542       v += 9;
3543     }
3544     idx = 3*i;
3545     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3546   }
3547   /* backward solve the upper triangular */
3548   for (i=n-1; i>=0; i--){
3549     v    = aa + 9*diag[i] + 9;
3550     vi   = aj + diag[i] + 1;
3551     nz   = ai[i+1] - diag[i] - 1;
3552     idt  = 3*i;
3553     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3554     while (nz--) {
3555       idx   = 3*(*vi++);
3556       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3557       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3558       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3559       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3560       v += 9;
3561     }
3562     idc = 3*(*c--);
3563     v   = aa + 9*diag[i];
3564     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3565     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3566     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3567   }
3568   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3569   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3570   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3571   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3572   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3573   PetscFunctionReturn(0);
3574 }
3575 
3576 #undef __FUNCT__
3577 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
3578 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
3579 {
3580   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3581   IS                iscol=a->col,isrow=a->row;
3582   PetscErrorCode    ierr;
3583   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3584   const PetscInt    *r,*c,*rout,*cout;
3585   const MatScalar   *aa=a->a,*v;
3586   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3587   const PetscScalar *b;
3588 
3589   PetscFunctionBegin;
3590   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3591   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3592   t  = a->solve_work;
3593 
3594   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3595   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3596 
3597   /* forward solve the lower triangular */
3598   idx    = 3*r[0];
3599   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3600   for (i=1; i<n; i++) {
3601     v     = aa + 9*ai[i];
3602     vi    = aj + ai[i];
3603     nz    = ai[i+1] - ai[i];
3604     idx   = 3*r[i];
3605     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3606     for(m=0;m<nz;m++){
3607       idx   = 3*vi[m];
3608       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3609       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3610       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3611       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3612       v += 9;
3613     }
3614     idx = 3*i;
3615     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3616   }
3617   /* backward solve the upper triangular */
3618   for (i=n-1; i>=0; i--){
3619     v    = aa + 9*(adiag[i+1]+1);
3620     vi   = aj + adiag[i+1]+1;
3621     nz   = adiag[i] - adiag[i+1] - 1;
3622     idt  = 3*i;
3623     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3624     for(m=0;m<nz;m++){
3625       idx   = 3*vi[m];
3626       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3627       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3628       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3629       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3630       v += 9;
3631     }
3632     idc = 3*c[i];
3633     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3634     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3635     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3636   }
3637   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3638   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3639   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3640   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3641   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3642   PetscFunctionReturn(0);
3643 }
3644 
3645 /*
3646       Special case where the matrix was ILU(0) factored in the natural
3647    ordering. This eliminates the need for the column and row permutation.
3648 */
3649 #undef __FUNCT__
3650 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3651 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
3652 {
3653   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3654   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3655   PetscErrorCode    ierr;
3656   PetscInt          *diag = a->diag;
3657   const MatScalar   *aa=a->a,*v;
3658   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3659   const PetscScalar *b;
3660   PetscInt          jdx,idt,idx,nz,*vi,i;
3661 
3662   PetscFunctionBegin;
3663   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3664   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3665 
3666   /* forward solve the lower triangular */
3667   idx    = 0;
3668   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
3669   for (i=1; i<n; i++) {
3670     v     =  aa      + 9*ai[i];
3671     vi    =  aj      + ai[i];
3672     nz    =  diag[i] - ai[i];
3673     idx   +=  3;
3674     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
3675     while (nz--) {
3676       jdx   = 3*(*vi++);
3677       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3678       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3679       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3680       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3681       v    += 9;
3682     }
3683     x[idx]   = s1;
3684     x[1+idx] = s2;
3685     x[2+idx] = s3;
3686   }
3687   /* backward solve the upper triangular */
3688   for (i=n-1; i>=0; i--){
3689     v    = aa + 9*diag[i] + 9;
3690     vi   = aj + diag[i] + 1;
3691     nz   = ai[i+1] - diag[i] - 1;
3692     idt  = 3*i;
3693     s1 = x[idt];  s2 = x[1+idt];
3694     s3 = x[2+idt];
3695     while (nz--) {
3696       idx   = 3*(*vi++);
3697       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3698       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3699       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3700       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3701       v    += 9;
3702     }
3703     v        = aa +  9*diag[i];
3704     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3705     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3706     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3707   }
3708 
3709   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3711   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3712   PetscFunctionReturn(0);
3713 }
3714 
3715 #undef __FUNCT__
3716 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3717 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3718 {
3719     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3720     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3721     PetscErrorCode    ierr;
3722     PetscInt          idx,jdx,idt;
3723     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3724     const MatScalar   *aa=a->a,*v;
3725     PetscScalar       *x;
3726     const PetscScalar *b;
3727     PetscScalar        s1,s2,s3,x1,x2,x3;
3728 
3729     PetscFunctionBegin;
3730     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3731     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3732     /* forward solve the lower triangular */
3733     idx    = 0;
3734     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3735     for (i=1; i<n; i++) {
3736        v    = aa + bs2*ai[i];
3737        vi   = aj + ai[i];
3738        nz   = ai[i+1] - ai[i];
3739       idx   = bs*i;
3740        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3741       for(k=0;k<nz;k++){
3742          jdx   = bs*vi[k];
3743           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3744           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3745           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3746           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3747 
3748           v   +=  bs2;
3749         }
3750 
3751        x[idx]   = s1;
3752        x[1+idx] = s2;
3753        x[2+idx] = s3;
3754     }
3755 
3756    /* backward solve the upper triangular */
3757   for (i=n-1; i>=0; i--){
3758     v   = aa + bs2*(adiag[i+1]+1);
3759      vi  = aj + adiag[i+1]+1;
3760      nz  = adiag[i] - adiag[i+1]-1;
3761      idt = bs*i;
3762      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3763 
3764      for(k=0;k<nz;k++){
3765        idx   = bs*vi[k];
3766        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3767        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3768        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3769        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3770 
3771         v   +=  bs2;
3772     }
3773     /* x = inv_diagonal*x */
3774    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3775    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3776    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3777 
3778   }
3779 
3780   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3781   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3782   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3783   PetscFunctionReturn(0);
3784 }
3785 
3786 #undef __FUNCT__
3787 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
3788 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
3789 {
3790   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3791   IS                iscol=a->col,isrow=a->row;
3792   PetscErrorCode    ierr;
3793   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3794   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3795   const MatScalar   *aa=a->a,*v;
3796   PetscScalar       *x,s1,s2,x1,x2,*t;
3797   const PetscScalar *b;
3798 
3799   PetscFunctionBegin;
3800   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3801   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3802   t  = a->solve_work;
3803 
3804   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3805   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3806 
3807   /* forward solve the lower triangular */
3808   idx    = 2*(*r++);
3809   t[0] = b[idx]; t[1] = b[1+idx];
3810   for (i=1; i<n; i++) {
3811     v     = aa + 4*ai[i];
3812     vi    = aj + ai[i];
3813     nz    = diag[i] - ai[i];
3814     idx   = 2*(*r++);
3815     s1  = b[idx]; s2 = b[1+idx];
3816     while (nz--) {
3817       idx   = 2*(*vi++);
3818       x1    = t[idx]; x2 = t[1+idx];
3819       s1 -= v[0]*x1 + v[2]*x2;
3820       s2 -= v[1]*x1 + v[3]*x2;
3821       v += 4;
3822     }
3823     idx = 2*i;
3824     t[idx] = s1; t[1+idx] = s2;
3825   }
3826   /* backward solve the upper triangular */
3827   for (i=n-1; i>=0; i--){
3828     v    = aa + 4*diag[i] + 4;
3829     vi   = aj + diag[i] + 1;
3830     nz   = ai[i+1] - diag[i] - 1;
3831     idt  = 2*i;
3832     s1 = t[idt]; s2 = t[1+idt];
3833     while (nz--) {
3834       idx   = 2*(*vi++);
3835       x1    = t[idx]; x2 = t[1+idx];
3836       s1 -= v[0]*x1 + v[2]*x2;
3837       s2 -= v[1]*x1 + v[3]*x2;
3838       v += 4;
3839     }
3840     idc = 2*(*c--);
3841     v   = aa + 4*diag[i];
3842     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3843     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
3844   }
3845   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3846   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3847   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3848   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3849   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3850   PetscFunctionReturn(0);
3851 }
3852 
3853 #undef __FUNCT__
3854 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
3855 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
3856 {
3857   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3858   IS                iscol=a->col,isrow=a->row;
3859   PetscErrorCode    ierr;
3860   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
3861   const PetscInt    *r,*c,*rout,*cout;
3862   const MatScalar   *aa=a->a,*v;
3863   PetscScalar       *x,s1,s2,x1,x2,*t;
3864   const PetscScalar *b;
3865 
3866   PetscFunctionBegin;
3867   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3868   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3869   t  = a->solve_work;
3870 
3871   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3872   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3873 
3874   /* forward solve the lower triangular */
3875   idx    = 2*r[0];
3876   t[0] = b[idx]; t[1] = b[1+idx];
3877   for (i=1; i<n; i++) {
3878     v     = aa + 4*ai[i];
3879     vi    = aj + ai[i];
3880     nz    = ai[i+1] - ai[i];
3881     idx   = 2*r[i];
3882     s1  = b[idx]; s2 = b[1+idx];
3883     for(m=0;m<nz;m++){
3884       jdx   = 2*vi[m];
3885       x1    = t[jdx]; x2 = t[1+jdx];
3886       s1 -= v[0]*x1 + v[2]*x2;
3887       s2 -= v[1]*x1 + v[3]*x2;
3888       v += 4;
3889     }
3890     idx = 2*i;
3891     t[idx] = s1; t[1+idx] = s2;
3892   }
3893   /* backward solve the upper triangular */
3894   for (i=n-1; i>=0; i--){
3895     v    = aa + 4*(adiag[i+1]+1);
3896     vi   = aj + adiag[i+1]+1;
3897     nz   = adiag[i] - adiag[i+1] - 1;
3898     idt  = 2*i;
3899     s1 = t[idt]; s2 = t[1+idt];
3900     for(m=0;m<nz;m++){
3901       idx   = 2*vi[m];
3902       x1    = t[idx]; x2 = t[1+idx];
3903       s1 -= v[0]*x1 + v[2]*x2;
3904       s2 -= v[1]*x1 + v[3]*x2;
3905       v += 4;
3906     }
3907     idc = 2*c[i];
3908     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3909     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
3910   }
3911   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3912   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3913   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3914   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3915   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3916   PetscFunctionReturn(0);
3917 }
3918 
3919 /*
3920       Special case where the matrix was ILU(0) factored in the natural
3921    ordering. This eliminates the need for the column and row permutation.
3922 */
3923 #undef __FUNCT__
3924 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3925 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
3926 {
3927   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3928   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3929   PetscErrorCode    ierr;
3930   PetscInt          *diag = a->diag;
3931   const MatScalar   *aa=a->a,*v;
3932   PetscScalar       *x,s1,s2,x1,x2;
3933   const PetscScalar *b;
3934   PetscInt          jdx,idt,idx,nz,*vi,i;
3935 
3936   PetscFunctionBegin;
3937   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3938   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3939 
3940   /* forward solve the lower triangular */
3941   idx    = 0;
3942   x[0]   = b[0]; x[1] = b[1];
3943   for (i=1; i<n; i++) {
3944     v     =  aa      + 4*ai[i];
3945     vi    =  aj      + ai[i];
3946     nz    =  diag[i] - ai[i];
3947     idx   +=  2;
3948     s1  =  b[idx];s2 = b[1+idx];
3949     while (nz--) {
3950       jdx   = 2*(*vi++);
3951       x1    = x[jdx];x2 = x[1+jdx];
3952       s1 -= v[0]*x1 + v[2]*x2;
3953       s2 -= v[1]*x1 + v[3]*x2;
3954       v    += 4;
3955     }
3956     x[idx]   = s1;
3957     x[1+idx] = s2;
3958   }
3959   /* backward solve the upper triangular */
3960   for (i=n-1; i>=0; i--){
3961     v    = aa + 4*diag[i] + 4;
3962     vi   = aj + diag[i] + 1;
3963     nz   = ai[i+1] - diag[i] - 1;
3964     idt  = 2*i;
3965     s1 = x[idt];  s2 = x[1+idt];
3966     while (nz--) {
3967       idx   = 2*(*vi++);
3968       x1    = x[idx];   x2 = x[1+idx];
3969       s1 -= v[0]*x1 + v[2]*x2;
3970       s2 -= v[1]*x1 + v[3]*x2;
3971       v    += 4;
3972     }
3973     v        = aa +  4*diag[i];
3974     x[idt]   = v[0]*s1 + v[2]*s2;
3975     x[1+idt] = v[1]*s1 + v[3]*s2;
3976   }
3977 
3978   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3979   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3980   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3981   PetscFunctionReturn(0);
3982 }
3983 
3984 #undef __FUNCT__
3985 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
3986 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3987 {
3988     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3989     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
3990     PetscErrorCode    ierr;
3991     PetscInt          jdx;
3992     const MatScalar   *aa=a->a,*v;
3993     PetscScalar       *x,s1,s2,x1,x2;
3994     const PetscScalar *b;
3995 
3996     PetscFunctionBegin;
3997     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3998     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3999     /* forward solve the lower triangular */
4000     idx    = 0;
4001     x[0] = b[idx]; x[1] = b[1+idx];
4002     for (i=1; i<n; i++) {
4003         v   = aa + 4*ai[i];
4004        vi   = aj + ai[i];
4005        nz   = ai[i+1] - ai[i];
4006        idx  = 2*i;
4007        s1   = b[idx];s2 = b[1+idx];
4008       for(k=0;k<nz;k++){
4009          jdx   = 2*vi[k];
4010           x1    = x[jdx];x2 = x[1+jdx];
4011           s1   -= v[0]*x1 + v[2]*x2;
4012           s2   -= v[1]*x1 + v[3]*x2;
4013            v   +=  4;
4014         }
4015        x[idx]   = s1;
4016        x[1+idx] = s2;
4017     }
4018 
4019    /* backward solve the upper triangular */
4020   for (i=n-1; i>=0; i--){
4021      v   = aa + 4*(adiag[i+1]+1);
4022      vi  = aj + adiag[i+1]+1;
4023      nz  = adiag[i] - adiag[i+1]-1;
4024      idt = 2*i;
4025      s1 = x[idt];  s2 = x[1+idt];
4026      for(k=0;k<nz;k++){
4027       idx   = 2*vi[k];
4028        x1    = x[idx];   x2 = x[1+idx];
4029        s1 -= v[0]*x1 + v[2]*x2;
4030        s2 -= v[1]*x1 + v[3]*x2;
4031          v    += 4;
4032     }
4033     /* x = inv_diagonal*x */
4034    x[idt]   = v[0]*s1 + v[2]*s2;
4035    x[1+idt] = v[1]*s1 + v[3]*s2;
4036   }
4037 
4038   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4039   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4040   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4041   PetscFunctionReturn(0);
4042 }
4043 
4044 #undef __FUNCT__
4045 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4046 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4047 {
4048   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4049   IS             iscol=a->col,isrow=a->row;
4050   PetscErrorCode ierr;
4051   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4052   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4053   MatScalar      *aa=a->a,*v;
4054   PetscScalar    *x,*b,s1,*t;
4055 
4056   PetscFunctionBegin;
4057   if (!n) PetscFunctionReturn(0);
4058 
4059   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4060   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4061   t  = a->solve_work;
4062 
4063   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4064   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4065 
4066   /* forward solve the lower triangular */
4067   t[0] = b[*r++];
4068   for (i=1; i<n; i++) {
4069     v     = aa + ai[i];
4070     vi    = aj + ai[i];
4071     nz    = diag[i] - ai[i];
4072     s1  = b[*r++];
4073     while (nz--) {
4074       s1 -= (*v++)*t[*vi++];
4075     }
4076     t[i] = s1;
4077   }
4078   /* backward solve the upper triangular */
4079   for (i=n-1; i>=0; i--){
4080     v    = aa + diag[i] + 1;
4081     vi   = aj + diag[i] + 1;
4082     nz   = ai[i+1] - diag[i] - 1;
4083     s1 = t[i];
4084     while (nz--) {
4085       s1 -= (*v++)*t[*vi++];
4086     }
4087     x[*c--] = t[i] = aa[diag[i]]*s1;
4088   }
4089 
4090   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4091   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4092   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4093   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4094   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4095   PetscFunctionReturn(0);
4096 }
4097 /*
4098       Special case where the matrix was ILU(0) factored in the natural
4099    ordering. This eliminates the need for the column and row permutation.
4100 */
4101 #undef __FUNCT__
4102 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4103 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
4104 {
4105   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4106   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4107   PetscErrorCode ierr;
4108   PetscInt       *diag = a->diag;
4109   MatScalar      *aa=a->a;
4110   PetscScalar    *x,*b;
4111   PetscScalar    s1,x1;
4112   MatScalar      *v;
4113   PetscInt       jdx,idt,idx,nz,*vi,i;
4114 
4115   PetscFunctionBegin;
4116   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4117   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4118 
4119   /* forward solve the lower triangular */
4120   idx    = 0;
4121   x[0]   = b[0];
4122   for (i=1; i<n; i++) {
4123     v     =  aa      + ai[i];
4124     vi    =  aj      + ai[i];
4125     nz    =  diag[i] - ai[i];
4126     idx   +=  1;
4127     s1  =  b[idx];
4128     while (nz--) {
4129       jdx   = *vi++;
4130       x1    = x[jdx];
4131       s1 -= v[0]*x1;
4132       v    += 1;
4133     }
4134     x[idx]   = s1;
4135   }
4136   /* backward solve the upper triangular */
4137   for (i=n-1; i>=0; i--){
4138     v    = aa + diag[i] + 1;
4139     vi   = aj + diag[i] + 1;
4140     nz   = ai[i+1] - diag[i] - 1;
4141     idt  = i;
4142     s1 = x[idt];
4143     while (nz--) {
4144       idx   = *vi++;
4145       x1    = x[idx];
4146       s1 -= v[0]*x1;
4147       v    += 1;
4148     }
4149     v        = aa +  diag[i];
4150     x[idt]   = v[0]*s1;
4151   }
4152   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4153   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4154   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4155   PetscFunctionReturn(0);
4156 }
4157 
/* ----------------------------------------------------------------*/
/* Forward declarations of routines that are called by the symbolic
   factorization code below but are not defined in this part of the file. */
EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
4162 
4163 #undef __FUNCT__
4164 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
4165 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
4166 {
4167   Mat            C=B;
4168   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
4169   IS             isrow = b->row,isicol = b->icol;
4170   PetscErrorCode ierr;
4171   const PetscInt *r,*ic,*ics;
4172   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
4173   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4174   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4175   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4176   MatScalar      *v_work;
4177   PetscTruth     col_identity,row_identity,both_identity;
4178 
4179   PetscFunctionBegin;
4180   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4181   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4182 
4183   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4184   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
4185   ics  = ic;
4186 
4187   /* generate work space needed by dense LU factorization */
4188   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
4189 
4190   for (i=0; i<n; i++){
4191     /* zero rtmp */
4192     /* L part */
4193     nz    = bi[i+1] - bi[i];
4194     bjtmp = bj + bi[i];
4195     for  (j=0; j<nz; j++){
4196       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4197     }
4198 
4199     /* U part */
4200     nz = bdiag[i] - bdiag[i+1];
4201     bjtmp = bj + bdiag[i+1]+1;
4202     for  (j=0; j<nz; j++){
4203       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4204     }
4205 
4206     /* load in initial (unfactored row) */
4207     nz    = ai[r[i]+1] - ai[r[i]];
4208     ajtmp = aj + ai[r[i]];
4209     v     = aa + bs2*ai[r[i]];
4210     for (j=0; j<nz; j++) {
4211       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
4212     }
4213 
4214     /* elimination */
4215     bjtmp = bj + bi[i];
4216     nzL   = bi[i+1] - bi[i];
4217     for(k=0;k < nzL;k++) {
4218       row = bjtmp[k];
4219       pc = rtmp + bs2*row;
4220       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4221       if (flg) {
4222         pv         = b->a + bs2*bdiag[row];
4223         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
4224         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
4225         pv         = b->a + bs2*(bdiag[row+1]+1);
4226         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
4227         for (j=0; j<nz; j++) {
4228           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4229         }
4230         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
4231       }
4232     }
4233 
4234     /* finished row so stick it into b->a */
4235     /* L part */
4236     pv   = b->a + bs2*bi[i] ;
4237     pj   = b->j + bi[i] ;
4238     nz   = bi[i+1] - bi[i];
4239     for (j=0; j<nz; j++) {
4240       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4241     }
4242 
4243     /* Mark diagonal and invert diagonal for simplier triangular solves */
4244     pv  = b->a + bs2*bdiag[i];
4245     pj  = b->j + bdiag[i];
4246     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4247     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4248     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
4249 
4250     /* U part */
4251     pv = b->a + bs2*(bdiag[i+1]+1);
4252     pj = b->j + bdiag[i+1]+1;
4253     nz = bdiag[i] - bdiag[i+1] - 1;
4254     for (j=0; j<nz; j++){
4255       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4256     }
4257   }
4258 
4259   ierr = PetscFree(rtmp);CHKERRQ(ierr);
4260   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
4261   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4262   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4263 
4264   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4265   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
4266   both_identity = (PetscTruth) (row_identity && col_identity);
4267   if (both_identity){
4268     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
4269   } else {
4270     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
4271   }
4272 
4273   C->assembled = PETSC_TRUE;
4274   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
4275   PetscFunctionReturn(0);
4276 }
4277 
4278 /*
4279    ilu(0) with natural ordering under new data structure.
4280    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
4281    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
4282 */
4283 
4284 #undef __FUNCT__
4285 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
4286 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4287 {
4288 
4289   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
4290   PetscErrorCode     ierr;
4291   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
4292   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
4293 
4294   PetscFunctionBegin;
4295   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
4296   b    = (Mat_SeqBAIJ*)(fact)->data;
4297 
4298   /* allocate matrix arrays for new data structure */
4299   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
4300   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
4301   b->singlemalloc = PETSC_TRUE;
4302   if (!b->diag){
4303     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
4304     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
4305   }
4306   bdiag = b->diag;
4307 
4308   if (n > 0) {
4309     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
4310   }
4311 
4312   /* set bi and bj with new data structure */
4313   bi = b->i;
4314   bj = b->j;
4315 
4316   /* L part */
4317   bi[0] = 0;
4318   for (i=0; i<n; i++){
4319     nz = adiag[i] - ai[i];
4320     bi[i+1] = bi[i] + nz;
4321     aj = a->j + ai[i];
4322     for (j=0; j<nz; j++){
4323       *bj = aj[j]; bj++;
4324     }
4325   }
4326 
4327   /* U part */
4328   bi_temp = bi[n];
4329   bdiag[n] = bi[n]-1;
4330   for (i=n-1; i>=0; i--){
4331     nz = ai[i+1] - adiag[i] - 1;
4332     bi_temp = bi_temp + nz + 1;
4333     aj = a->j + adiag[i] + 1;
4334     for (j=0; j<nz; j++){
4335       *bj = aj[j]; bj++;
4336     }
4337     /* diag[i] */
4338     *bj = i; bj++;
4339     bdiag[i] = bi_temp - 1;
4340   }
4341   PetscFunctionReturn(0);
4342 }
4343 
4344 #undef __FUNCT__
4345 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
4346 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4347 {
4348   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
4349   IS                 isicol;
4350   PetscErrorCode     ierr;
4351   const PetscInt     *r,*ic;
4352   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
4353   PetscInt           *bi,*cols,nnz,*cols_lvl;
4354   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
4355   PetscInt           i,levels,diagonal_fill;
4356   PetscTruth         col_identity,row_identity,both_identity;
4357   PetscReal          f;
4358   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
4359   PetscBT            lnkbt;
4360   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
4361   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
4362   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
4363   PetscTruth         missing;
4364   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
4365 
4366   PetscFunctionBegin;
4367   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
4368   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
4369   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
4370 
4371   f             = info->fill;
4372   levels        = (PetscInt)info->levels;
4373   diagonal_fill = (PetscInt)info->diagonal_fill;
4374   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4375 
4376   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4377   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4378   both_identity = (PetscTruth) (row_identity && col_identity);
4379 
4380   if (!levels && both_identity) {
4381     /* special case: ilu(0) with natural ordering */
4382     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4383     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
4384 
4385     fact->factor = MAT_FACTOR_ILU;
4386     (fact)->info.factor_mallocs    = 0;
4387     (fact)->info.fill_ratio_given  = info->fill;
4388     (fact)->info.fill_ratio_needed = 1.0;
4389     b                = (Mat_SeqBAIJ*)(fact)->data;
4390     b->row           = isrow;
4391     b->col           = iscol;
4392     b->icol          = isicol;
4393     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4394     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4395     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4396     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4397     PetscFunctionReturn(0);
4398   }
4399 
4400   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4401   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4402 
4403   /* get new row pointers */
4404   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
4405   bi[0] = 0;
4406   /* bdiag is location of diagonal in factor */
4407   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
4408   bdiag[0]  = 0;
4409 
4410   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
4411 
4412   /* create a linked list for storing column indices of the active row */
4413   nlnk = n + 1;
4414   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
4415 
4416   /* initial FreeSpace size is f*(ai[n]+1) */
4417   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
4418   current_space = free_space;
4419   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
4420   current_space_lvl = free_space_lvl;
4421 
4422   for (i=0; i<n; i++) {
4423     nzi = 0;
4424     /* copy current row into linked list */
4425     nnz  = ai[r[i]+1] - ai[r[i]];
4426     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
4427     cols = aj + ai[r[i]];
4428     lnk[i] = -1; /* marker to indicate if diagonal exists */
4429     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
4430     nzi += nlnk;
4431 
4432     /* make sure diagonal entry is included */
4433     if (diagonal_fill && lnk[i] == -1) {
4434       fm = n;
4435       while (lnk[fm] < i) fm = lnk[fm];
4436       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
4437       lnk[fm]    = i;
4438       lnk_lvl[i] = 0;
4439       nzi++; dcount++;
4440     }
4441 
4442     /* add pivot rows into the active row */
4443     nzbd = 0;
4444     prow = lnk[n];
4445     while (prow < i) {
4446       nnz      = bdiag[prow];
4447       cols     = bj_ptr[prow] + nnz + 1;
4448       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
4449       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
4450       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
4451       nzi += nlnk;
4452       prow = lnk[prow];
4453       nzbd++;
4454     }
4455     bdiag[i] = nzbd;
4456     bi[i+1]  = bi[i] + nzi;
4457 
4458     /* if free space is not available, make more free space */
4459     if (current_space->local_remaining<nzi) {
4460       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
4461       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
4462       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
4463       reallocs++;
4464     }
4465 
4466     /* copy data into free_space and free_space_lvl, then initialize lnk */
4467     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
4468     bj_ptr[i]    = current_space->array;
4469     bjlvl_ptr[i] = current_space_lvl->array;
4470 
4471     /* make sure the active row i has diagonal entry */
4472     if (*(bj_ptr[i]+bdiag[i]) != i) {
4473       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
4474     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
4475     }
4476 
4477     current_space->array           += nzi;
4478     current_space->local_used      += nzi;
4479     current_space->local_remaining -= nzi;
4480     current_space_lvl->array           += nzi;
4481     current_space_lvl->local_used      += nzi;
4482     current_space_lvl->local_remaining -= nzi;
4483   }
4484 
4485   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4486   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4487 
4488   /* destroy list of free space and other temporary arrays */
4489   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
4490 
4491   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
4492   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
4493 
4494   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
4495   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
4496   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
4497 
4498 #if defined(PETSC_USE_INFO)
4499   {
4500     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
4501     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
4502     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4503     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
4504     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4505     if (diagonal_fill) {
4506       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
4507     }
4508   }
4509 #endif
4510 
4511   /* put together the new matrix */
4512   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4513   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4514   b = (Mat_SeqBAIJ*)(fact)->data;
4515   b->free_a       = PETSC_TRUE;
4516   b->free_ij      = PETSC_TRUE;
4517   b->singlemalloc = PETSC_FALSE;
4518   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
4519   b->j          = bj;
4520   b->i          = bi;
4521   b->diag       = bdiag;
4522   b->free_diag  = PETSC_TRUE;
4523   b->ilen       = 0;
4524   b->imax       = 0;
4525   b->row        = isrow;
4526   b->col        = iscol;
4527   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4528   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4529   b->icol       = isicol;
4530   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4531   /* In b structure:  Free imax, ilen, old a, old j.
4532      Allocate bdiag, solve_work, new a, new j */
4533   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
4534   b->maxnz = b->nz = bdiag[0]+1;
4535   fact->info.factor_mallocs    = reallocs;
4536   fact->info.fill_ratio_given  = f;
4537   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
4538   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
4539   PetscFunctionReturn(0);
4540 }
4541 
4542 
4543 /*
4544      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
4545    except that the data structure of Mat_SeqAIJ is slightly different.
4546    Not a good example of code reuse.
4547 */
4548 #undef __FUNCT__
4549 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
4550 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4551 {
4552   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
4553   IS             isicol;
4554   PetscErrorCode ierr;
4555   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
4556   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
4557   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
4558   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
4559   PetscTruth     col_identity,row_identity,both_identity,flg;
4560   PetscReal      f;
4561   PetscTruth     newdatastruct = PETSC_FALSE;
4562 
4563   PetscFunctionBegin;
4564   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
4565   if (newdatastruct){
4566     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4567     PetscFunctionReturn(0);
4568   }
4569 
4570   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
4571   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
4572 
4573   f             = info->fill;
4574   levels        = (PetscInt)info->levels;
4575   diagonal_fill = (PetscInt)info->diagonal_fill;
4576   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4577 
4578   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4579   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4580   both_identity = (PetscTruth) (row_identity && col_identity);
4581 
4582   if (!levels && both_identity) {  /* special case copy the nonzero structure */
4583     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
4584     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
4585 
4586     fact->factor = MAT_FACTOR_ILU;
4587     b            = (Mat_SeqBAIJ*)fact->data;
4588     b->row       = isrow;
4589     b->col       = iscol;
4590     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4591     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4592     b->icol      = isicol;
4593     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4594     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4595     PetscFunctionReturn(0);
4596   }
4597 
4598   /* general case perform the symbolic factorization */
4599     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4600     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4601 
4602     /* get new row pointers */
4603     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
4604     ainew[0] = 0;
4605     /* don't know how many column pointers are needed so estimate */
4606     jmax = (PetscInt)(f*ai[n] + 1);
4607     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
4608     /* ajfill is level of fill for each fill entry */
4609     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
4610     /* fill is a linked list of nonzeros in active row */
4611     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
4612     /* im is level for each filled value */
4613     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
4614     /* dloc is location of diagonal in factor */
4615     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
4616     dloc[0]  = 0;
4617     for (prow=0; prow<n; prow++) {
4618 
4619       /* copy prow into linked list */
4620       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
4621       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
4622       xi         = aj + ai[r[prow]];
4623       fill[n]    = n;
4624       fill[prow] = -1; /* marker for diagonal entry */
4625       while (nz--) {
4626 	fm  = n;
4627 	idx = ic[*xi++];
4628 	do {
4629 	  m  = fm;
4630 	  fm = fill[m];
4631 	} while (fm < idx);
4632 	fill[m]   = idx;
4633 	fill[idx] = fm;
4634 	im[idx]   = 0;
4635       }
4636 
4637       /* make sure diagonal entry is included */
4638       if (diagonal_fill && fill[prow] == -1) {
4639 	fm = n;
4640 	while (fill[fm] < prow) fm = fill[fm];
4641 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
4642 	fill[fm]   = prow;
4643 	im[prow]   = 0;
4644 	nzf++;
4645 	dcount++;
4646       }
4647 
4648       nzi = 0;
4649       row = fill[n];
4650       while (row < prow) {
4651 	incrlev = im[row] + 1;
4652 	nz      = dloc[row];
4653 	xi      = ajnew  + ainew[row] + nz + 1;
4654 	flev    = ajfill + ainew[row] + nz + 1;
4655 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
4656 	fm      = row;
4657 	while (nnz-- > 0) {
4658 	  idx = *xi++;
4659 	  if (*flev + incrlev > levels) {
4660 	    flev++;
4661 	    continue;
4662 	  }
4663 	  do {
4664 	    m  = fm;
4665 	    fm = fill[m];
4666 	  } while (fm < idx);
4667 	  if (fm != idx) {
4668 	    im[idx]   = *flev + incrlev;
4669 	    fill[m]   = idx;
4670 	    fill[idx] = fm;
4671 	    fm        = idx;
4672 	    nzf++;
4673 	  } else {
4674 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
4675 	  }
4676 	  flev++;
4677 	}
4678 	row = fill[row];
4679 	nzi++;
4680       }
4681       /* copy new filled row into permanent storage */
4682       ainew[prow+1] = ainew[prow] + nzf;
4683       if (ainew[prow+1] > jmax) {
4684 
4685 	/* estimate how much additional space we will need */
4686 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
4687 	/* just double the memory each time */
4688 	PetscInt maxadd = jmax;
4689 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
4690 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
4691 	jmax += maxadd;
4692 
4693 	/* allocate a longer ajnew and ajfill */
4694 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
4695 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4696 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
4697 	ajnew = xitmp;
4698 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
4699 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4700 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
4701 	ajfill = xitmp;
4702 	reallocate++; /* count how many reallocations are needed */
4703       }
4704       xitmp       = ajnew + ainew[prow];
4705       flev        = ajfill + ainew[prow];
4706       dloc[prow]  = nzi;
4707       fm          = fill[n];
4708       while (nzf--) {
4709 	*xitmp++ = fm;
4710 	*flev++ = im[fm];
4711 	fm      = fill[fm];
4712       }
4713       /* make sure row has diagonal entry */
4714       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
4715 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
4716     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
4717       }
4718     }
4719     ierr = PetscFree(ajfill);CHKERRQ(ierr);
4720     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4721     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4722     ierr = PetscFree(fill);CHKERRQ(ierr);
4723     ierr = PetscFree(im);CHKERRQ(ierr);
4724 
4725 #if defined(PETSC_USE_INFO)
4726     {
4727       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
4728       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
4729       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4730       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
4731       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4732       if (diagonal_fill) {
4733 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
4734       }
4735     }
4736 #endif
4737 
4738     /* put together the new matrix */
4739     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4740     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4741     b    = (Mat_SeqBAIJ*)fact->data;
4742     b->free_a       = PETSC_TRUE;
4743     b->free_ij      = PETSC_TRUE;
4744     b->singlemalloc = PETSC_FALSE;
4745     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
4746     b->j          = ajnew;
4747     b->i          = ainew;
4748     for (i=0; i<n; i++) dloc[i] += ainew[i];
4749     b->diag       = dloc;
4750     b->free_diag  = PETSC_TRUE;
4751     b->ilen       = 0;
4752     b->imax       = 0;
4753     b->row        = isrow;
4754     b->col        = iscol;
4755     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4756     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4757     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4758     b->icol       = isicol;
4759     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4760     /* In b structure:  Free imax, ilen, old a, old j.
4761        Allocate dloc, solve_work, new a, new j */
4762     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
4763     b->maxnz          = b->nz = ainew[n];
4764 
4765     fact->info.factor_mallocs    = reallocate;
4766     fact->info.fill_ratio_given  = f;
4767     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
4768 
4769   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
4770   PetscFunctionReturn(0);
4771 }
4772 
/*
   MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE - Clears the matrix's
   setunfactored operation.  The code that would undo the column scaling is
   disabled, so the column index array is left untouched.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
{
  /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
  /* int i,*AJ=a->j,nz=a->nz; */
  PetscFunctionBegin;
  /* Undo Column scaling */
  /* NOTE(review): the disabled loop below never initializes or advances i,
     so it could not be re-enabled as written. */
/*    while (nz--) { */
/*      AJ[i] = AJ[i]/4; */
/*    } */
  /* This should really invoke a push/pop logic, but we don't have that yet. */
  A->ops->setunfactored = PETSC_NULL;
  PetscFunctionReturn(0);
}
4788 
4789 #undef __FUNCT__
4790 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
4791 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
4792 {
4793   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4794   PetscInt       *AJ=a->j,nz=a->nz;
4795   unsigned short *aj=(unsigned short *)AJ;
4796   PetscFunctionBegin;
4797   /* Is this really necessary? */
4798   while (nz--) {
4799     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
4800   }
4801   A->ops->setunfactored = PETSC_NULL;
4802   PetscFunctionReturn(0);
4803 }
4804 
4805 
4806