xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 78bb40077513d5120f4c52e0fb25a84efb280004)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124   PetscInt       *diag = a->diag,oidx;
125   MatScalar      *aa=a->a,*v;
126   PetscScalar    s1,s2,s3,x1,x2,x3;
127   PetscScalar    *x,*b;
128 
129   PetscFunctionBegin;
130   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133 
134   /* forward solve the U^T */
135   idx = 0;
136   for (i=0; i<n; i++) {
137 
138     v     = aa + 9*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144     v += 9;
145 
146     vi    = aj + diag[i] + 1;
147     nz    = ai[i+1] - diag[i] - 1;
148     while (nz--) {
149       oidx = 3*(*vi++);
150       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153       v  += 9;
154     }
155     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156     idx += 3;
157   }
158   /* backward solve the L^T */
159   for (i=n-1; i>=0; i--){
160     v    = aa + 9*diag[i] - 9;
161     vi   = aj + diag[i] - 1;
162     nz   = diag[i] - ai[i];
163     idt  = 3*i;
164     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165     while (nz--) {
166       idx   = 3*(*vi--);
167       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170       v -= 9;
171     }
172   }
173   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182 {
183   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184   PetscErrorCode ierr;
185   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186   PetscInt       *diag = a->diag,oidx;
187   MatScalar      *aa=a->a,*v;
188   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
189   PetscScalar    *x,*b;
190 
191   PetscFunctionBegin;
192   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
193   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195 
196   /* forward solve the U^T */
197   idx = 0;
198   for (i=0; i<n; i++) {
199 
200     v     = aa + 16*diag[i];
201     /* multiply by the inverse of the block diagonal */
202     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207     v += 16;
208 
209     vi    = aj + diag[i] + 1;
210     nz    = ai[i+1] - diag[i] - 1;
211     while (nz--) {
212       oidx = 4*(*vi++);
213       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217       v  += 16;
218     }
219     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220     idx += 4;
221   }
222   /* backward solve the L^T */
223   for (i=n-1; i>=0; i--){
224     v    = aa + 16*diag[i] - 16;
225     vi   = aj + diag[i] - 1;
226     nz   = diag[i] - ai[i];
227     idt  = 4*i;
228     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229     while (nz--) {
230       idx   = 4*(*vi--);
231       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235       v -= 16;
236     }
237   }
238   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
239   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241   PetscFunctionReturn(0);
242 }
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247 {
248   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249   PetscErrorCode ierr;
250   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251   PetscInt       *diag = a->diag,oidx;
252   MatScalar      *aa=a->a,*v;
253   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
254   PetscScalar    *x,*b;
255 
256   PetscFunctionBegin;
257   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
258   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260 
261   /* forward solve the U^T */
262   idx = 0;
263   for (i=0; i<n; i++) {
264 
265     v     = aa + 25*diag[i];
266     /* multiply by the inverse of the block diagonal */
267     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273     v += 25;
274 
275     vi    = aj + diag[i] + 1;
276     nz    = ai[i+1] - diag[i] - 1;
277     while (nz--) {
278       oidx = 5*(*vi++);
279       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284       v  += 25;
285     }
286     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287     idx += 5;
288   }
289   /* backward solve the L^T */
290   for (i=n-1; i>=0; i--){
291     v    = aa + 25*diag[i] - 25;
292     vi   = aj + diag[i] - 1;
293     nz   = diag[i] - ai[i];
294     idt  = 5*i;
295     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296     while (nz--) {
297       idx   = 5*(*vi--);
298       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303       v -= 25;
304     }
305   }
306   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
307   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315 {
316   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317   PetscErrorCode ierr;
318   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319   PetscInt       *diag = a->diag,oidx;
320   MatScalar      *aa=a->a,*v;
321   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
322   PetscScalar    *x,*b;
323 
324   PetscFunctionBegin;
325   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
326   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
327   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328 
329   /* forward solve the U^T */
330   idx = 0;
331   for (i=0; i<n; i++) {
332 
333     v     = aa + 36*diag[i];
334     /* multiply by the inverse of the block diagonal */
335     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336     x6    = x[5+idx];
337     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343     v += 36;
344 
345     vi    = aj + diag[i] + 1;
346     nz    = ai[i+1] - diag[i] - 1;
347     while (nz--) {
348       oidx = 6*(*vi++);
349       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355       v  += 36;
356     }
357     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358     x[5+idx] = s6;
359     idx += 6;
360   }
361   /* backward solve the L^T */
362   for (i=n-1; i>=0; i--){
363     v    = aa + 36*diag[i] - 36;
364     vi   = aj + diag[i] - 1;
365     nz   = diag[i] - ai[i];
366     idt  = 6*i;
367     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368     s6 = x[5+idt];
369     while (nz--) {
370       idx   = 6*(*vi--);
371       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377       v -= 36;
378     }
379   }
380   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389 {
390   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391   PetscErrorCode ierr;
392   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393   PetscInt       *diag = a->diag,oidx;
394   MatScalar      *aa=a->a,*v;
395   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
396   PetscScalar    *x,*b;
397 
398   PetscFunctionBegin;
399   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
400   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
401   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402 
403   /* forward solve the U^T */
404   idx = 0;
405   for (i=0; i<n; i++) {
406 
407     v     = aa + 49*diag[i];
408     /* multiply by the inverse of the block diagonal */
409     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410     x6    = x[5+idx]; x7 = x[6+idx];
411     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418     v += 49;
419 
420     vi    = aj + diag[i] + 1;
421     nz    = ai[i+1] - diag[i] - 1;
422     while (nz--) {
423       oidx = 7*(*vi++);
424       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431       v  += 49;
432     }
433     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434     x[5+idx] = s6;x[6+idx] = s7;
435     idx += 7;
436   }
437   /* backward solve the L^T */
438   for (i=n-1; i>=0; i--){
439     v    = aa + 49*diag[i] - 49;
440     vi   = aj + diag[i] - 1;
441     nz   = diag[i] - ai[i];
442     idt  = 7*i;
443     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444     s6 = x[5+idt];s7 = x[6+idt];
445     while (nz--) {
446       idx   = 7*(*vi--);
447       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454       v -= 49;
455     }
456   }
457   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
458   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 /*---------------------------------------------------------------------------------------------*/
464 #undef __FUNCT__
465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469   IS             iscol=a->col,isrow=a->row;
470   PetscErrorCode ierr;
471   const PetscInt *r,*c,*rout,*cout;
472   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473   PetscInt       *diag = a->diag;
474   MatScalar      *aa=a->a,*v;
475   PetscScalar    s1,*x,*b,*t;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   t  = a->solve_work;
481 
482   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484 
485   /* copy the b into temp work space according to permutation */
486   for (i=0; i<n; i++) {
487     t[i] = b[c[i]];
488   }
489 
490   /* forward solve the U^T */
491   for (i=0; i<n; i++) {
492 
493     v     = aa + diag[i];
494     /* multiply by the inverse of the block diagonal */
495     s1    = (*v++)*t[i];
496     vi    = aj + diag[i] + 1;
497     nz    = ai[i+1] - diag[i] - 1;
498     while (nz--) {
499       t[*vi++]  -= (*v++)*s1;
500     }
501     t[i]   = s1;
502   }
503   /* backward solve the L^T */
504   for (i=n-1; i>=0; i--){
505     v    = aa + diag[i] - 1;
506     vi   = aj + diag[i] - 1;
507     nz   = diag[i] - ai[i];
508     s1   = t[i];
509     while (nz--) {
510       t[*vi--]   -=  (*v--)*s1;
511     }
512   }
513 
514   /* copy t into x according to permutation */
515   for (i=0; i<n; i++) {
516     x[r[i]]   = t[i];
517   }
518 
519   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
521   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524   PetscFunctionReturn(0);
525 }
526 
527 #undef __FUNCT__
528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530 {
531   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532   IS             iscol=a->col,isrow=a->row;
533   PetscErrorCode ierr;
534   const PetscInt *r,*c,*rout,*cout;
535   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537   MatScalar      *aa=a->a,*v;
538   PetscScalar    s1,s2,x1,x2;
539   PetscScalar    *x,*b,*t;
540 
541   PetscFunctionBegin;
542   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544   t  = a->solve_work;
545 
546   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548 
549   /* copy the b into temp work space according to permutation */
550   ii = 0;
551   for (i=0; i<n; i++) {
552     ic      = 2*c[i];
553     t[ii]   = b[ic];
554     t[ii+1] = b[ic+1];
555     ii += 2;
556   }
557 
558   /* forward solve the U^T */
559   idx = 0;
560   for (i=0; i<n; i++) {
561 
562     v     = aa + 4*diag[i];
563     /* multiply by the inverse of the block diagonal */
564     x1    = t[idx];   x2 = t[1+idx];
565     s1 = v[0]*x1  +  v[1]*x2;
566     s2 = v[2]*x1  +  v[3]*x2;
567     v += 4;
568 
569     vi    = aj + diag[i] + 1;
570     nz    = ai[i+1] - diag[i] - 1;
571     while (nz--) {
572       oidx = 2*(*vi++);
573       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575       v  += 4;
576     }
577     t[idx]   = s1;t[1+idx] = s2;
578     idx += 2;
579   }
580   /* backward solve the L^T */
581   for (i=n-1; i>=0; i--){
582     v    = aa + 4*diag[i] - 4;
583     vi   = aj + diag[i] - 1;
584     nz   = diag[i] - ai[i];
585     idt  = 2*i;
586     s1 = t[idt];  s2 = t[1+idt];
587     while (nz--) {
588       idx   = 2*(*vi--);
589       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591       v -= 4;
592     }
593   }
594 
595   /* copy t into x according to permutation */
596   ii = 0;
597   for (i=0; i<n; i++) {
598     ir      = 2*r[i];
599     x[ir]   = t[ii];
600     x[ir+1] = t[ii+1];
601     ii += 2;
602   }
603 
604   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
606   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615 {
616   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617   IS             iscol=a->col,isrow=a->row;
618   PetscErrorCode ierr;
619   const PetscInt *r,*c,*rout,*cout;
620   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622   MatScalar      *aa=a->a,*v;
623   PetscScalar    s1,s2,s3,x1,x2,x3;
624   PetscScalar    *x,*b,*t;
625 
626   PetscFunctionBegin;
627   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629   t  = a->solve_work;
630 
631   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633 
634   /* copy the b into temp work space according to permutation */
635   ii = 0;
636   for (i=0; i<n; i++) {
637     ic      = 3*c[i];
638     t[ii]   = b[ic];
639     t[ii+1] = b[ic+1];
640     t[ii+2] = b[ic+2];
641     ii += 3;
642   }
643 
644   /* forward solve the U^T */
645   idx = 0;
646   for (i=0; i<n; i++) {
647 
648     v     = aa + 9*diag[i];
649     /* multiply by the inverse of the block diagonal */
650     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654     v += 9;
655 
656     vi    = aj + diag[i] + 1;
657     nz    = ai[i+1] - diag[i] - 1;
658     while (nz--) {
659       oidx = 3*(*vi++);
660       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663       v  += 9;
664     }
665     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666     idx += 3;
667   }
668   /* backward solve the L^T */
669   for (i=n-1; i>=0; i--){
670     v    = aa + 9*diag[i] - 9;
671     vi   = aj + diag[i] - 1;
672     nz   = diag[i] - ai[i];
673     idt  = 3*i;
674     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675     while (nz--) {
676       idx   = 3*(*vi--);
677       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680       v -= 9;
681     }
682   }
683 
684   /* copy t into x according to permutation */
685   ii = 0;
686   for (i=0; i<n; i++) {
687     ir      = 3*r[i];
688     x[ir]   = t[ii];
689     x[ir+1] = t[ii+1];
690     x[ir+2] = t[ii+2];
691     ii += 3;
692   }
693 
694   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
696   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
697   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699   PetscFunctionReturn(0);
700 }
701 
702 #undef __FUNCT__
703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705 {
706   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707   IS             iscol=a->col,isrow=a->row;
708   PetscErrorCode ierr;
709   const PetscInt *r,*c,*rout,*cout;
710   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712   MatScalar      *aa=a->a,*v;
713   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
714   PetscScalar    *x,*b,*t;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   t  = a->solve_work;
720 
721   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723 
724   /* copy the b into temp work space according to permutation */
725   ii = 0;
726   for (i=0; i<n; i++) {
727     ic      = 4*c[i];
728     t[ii]   = b[ic];
729     t[ii+1] = b[ic+1];
730     t[ii+2] = b[ic+2];
731     t[ii+3] = b[ic+3];
732     ii += 4;
733   }
734 
735   /* forward solve the U^T */
736   idx = 0;
737   for (i=0; i<n; i++) {
738 
739     v     = aa + 16*diag[i];
740     /* multiply by the inverse of the block diagonal */
741     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746     v += 16;
747 
748     vi    = aj + diag[i] + 1;
749     nz    = ai[i+1] - diag[i] - 1;
750     while (nz--) {
751       oidx = 4*(*vi++);
752       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756       v  += 16;
757     }
758     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759     idx += 4;
760   }
761   /* backward solve the L^T */
762   for (i=n-1; i>=0; i--){
763     v    = aa + 16*diag[i] - 16;
764     vi   = aj + diag[i] - 1;
765     nz   = diag[i] - ai[i];
766     idt  = 4*i;
767     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768     while (nz--) {
769       idx   = 4*(*vi--);
770       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774       v -= 16;
775     }
776   }
777 
778   /* copy t into x according to permutation */
779   ii = 0;
780   for (i=0; i<n; i++) {
781     ir      = 4*r[i];
782     x[ir]   = t[ii];
783     x[ir+1] = t[ii+1];
784     x[ir+2] = t[ii+2];
785     x[ir+3] = t[ii+3];
786     ii += 4;
787   }
788 
789   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
791   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 #undef __FUNCT__
798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800 {
801   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802   IS             iscol=a->col,isrow=a->row;
803   PetscErrorCode ierr;
804   const PetscInt *r,*c,*rout,*cout;
805   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807   MatScalar      *aa=a->a,*v;
808   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
809   PetscScalar    *x,*b,*t;
810 
811   PetscFunctionBegin;
812   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   t  = a->solve_work;
815 
816   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818 
819   /* copy the b into temp work space according to permutation */
820   ii = 0;
821   for (i=0; i<n; i++) {
822     ic      = 5*c[i];
823     t[ii]   = b[ic];
824     t[ii+1] = b[ic+1];
825     t[ii+2] = b[ic+2];
826     t[ii+3] = b[ic+3];
827     t[ii+4] = b[ic+4];
828     ii += 5;
829   }
830 
831   /* forward solve the U^T */
832   idx = 0;
833   for (i=0; i<n; i++) {
834 
835     v     = aa + 25*diag[i];
836     /* multiply by the inverse of the block diagonal */
837     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843     v += 25;
844 
845     vi    = aj + diag[i] + 1;
846     nz    = ai[i+1] - diag[i] - 1;
847     while (nz--) {
848       oidx = 5*(*vi++);
849       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854       v  += 25;
855     }
856     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857     idx += 5;
858   }
859   /* backward solve the L^T */
860   for (i=n-1; i>=0; i--){
861     v    = aa + 25*diag[i] - 25;
862     vi   = aj + diag[i] - 1;
863     nz   = diag[i] - ai[i];
864     idt  = 5*i;
865     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866     while (nz--) {
867       idx   = 5*(*vi--);
868       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873       v -= 25;
874     }
875   }
876 
877   /* copy t into x according to permutation */
878   ii = 0;
879   for (i=0; i<n; i++) {
880     ir      = 5*r[i];
881     x[ir]   = t[ii];
882     x[ir+1] = t[ii+1];
883     x[ir+2] = t[ii+2];
884     x[ir+3] = t[ii+3];
885     x[ir+4] = t[ii+4];
886     ii += 5;
887   }
888 
889   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
891   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902   IS             iscol=a->col,isrow=a->row;
903   PetscErrorCode ierr;
904   const PetscInt *r,*c,*rout,*cout;
905   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907   MatScalar      *aa=a->a,*v;
908   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
909   PetscScalar    *x,*b,*t;
910 
911   PetscFunctionBegin;
912   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914   t  = a->solve_work;
915 
916   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918 
919   /* copy the b into temp work space according to permutation */
920   ii = 0;
921   for (i=0; i<n; i++) {
922     ic      = 6*c[i];
923     t[ii]   = b[ic];
924     t[ii+1] = b[ic+1];
925     t[ii+2] = b[ic+2];
926     t[ii+3] = b[ic+3];
927     t[ii+4] = b[ic+4];
928     t[ii+5] = b[ic+5];
929     ii += 6;
930   }
931 
932   /* forward solve the U^T */
933   idx = 0;
934   for (i=0; i<n; i++) {
935 
936     v     = aa + 36*diag[i];
937     /* multiply by the inverse of the block diagonal */
938     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939     x6    = t[5+idx];
940     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946     v += 36;
947 
948     vi    = aj + diag[i] + 1;
949     nz    = ai[i+1] - diag[i] - 1;
950     while (nz--) {
951       oidx = 6*(*vi++);
952       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958       v  += 36;
959     }
960     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961     t[5+idx] = s6;
962     idx += 6;
963   }
964   /* backward solve the L^T */
965   for (i=n-1; i>=0; i--){
966     v    = aa + 36*diag[i] - 36;
967     vi   = aj + diag[i] - 1;
968     nz   = diag[i] - ai[i];
969     idt  = 6*i;
970     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971     s6 = t[5+idt];
972     while (nz--) {
973       idx   = 6*(*vi--);
974       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980       v -= 36;
981     }
982   }
983 
984   /* copy t into x according to permutation */
985   ii = 0;
986   for (i=0; i<n; i++) {
987     ir      = 6*r[i];
988     x[ir]   = t[ii];
989     x[ir+1] = t[ii+1];
990     x[ir+2] = t[ii+2];
991     x[ir+3] = t[ii+3];
992     x[ir+4] = t[ii+4];
993     x[ir+5] = t[ii+5];
994     ii += 6;
995   }
996 
997   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
999   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 #undef __FUNCT__
1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008 {
1009   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010   IS             iscol=a->col,isrow=a->row;
1011   PetscErrorCode ierr;
1012   const PetscInt *r,*c,*rout,*cout;
1013   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015   MatScalar      *aa=a->a,*v;
1016   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1017   PetscScalar    *x,*b,*t;
1018 
1019   PetscFunctionBegin;
1020   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022   t  = a->solve_work;
1023 
1024   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026 
1027   /* copy the b into temp work space according to permutation */
1028   ii = 0;
1029   for (i=0; i<n; i++) {
1030     ic      = 7*c[i];
1031     t[ii]   = b[ic];
1032     t[ii+1] = b[ic+1];
1033     t[ii+2] = b[ic+2];
1034     t[ii+3] = b[ic+3];
1035     t[ii+4] = b[ic+4];
1036     t[ii+5] = b[ic+5];
1037     t[ii+6] = b[ic+6];
1038     ii += 7;
1039   }
1040 
1041   /* forward solve the U^T */
1042   idx = 0;
1043   for (i=0; i<n; i++) {
1044 
1045     v     = aa + 49*diag[i];
1046     /* multiply by the inverse of the block diagonal */
1047     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048     x6    = t[5+idx]; x7 = t[6+idx];
1049     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056     v += 49;
1057 
1058     vi    = aj + diag[i] + 1;
1059     nz    = ai[i+1] - diag[i] - 1;
1060     while (nz--) {
1061       oidx = 7*(*vi++);
1062       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069       v  += 49;
1070     }
1071     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072     t[5+idx] = s6;t[6+idx] = s7;
1073     idx += 7;
1074   }
1075   /* backward solve the L^T */
1076   for (i=n-1; i>=0; i--){
1077     v    = aa + 49*diag[i] - 49;
1078     vi   = aj + diag[i] - 1;
1079     nz   = diag[i] - ai[i];
1080     idt  = 7*i;
1081     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082     s6 = t[5+idt];s7 = t[6+idt];
1083     while (nz--) {
1084       idx   = 7*(*vi--);
1085       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092       v -= 49;
1093     }
1094   }
1095 
1096   /* copy t into x according to permutation */
1097   ii = 0;
1098   for (i=0; i<n; i++) {
1099     ir      = 7*r[i];
1100     x[ir]   = t[ii];
1101     x[ir+1] = t[ii+1];
1102     x[ir+2] = t[ii+2];
1103     x[ir+3] = t[ii+3];
1104     x[ir+4] = t[ii+4];
1105     x[ir+5] = t[ii+5];
1106     x[ir+6] = t[ii+6];
1107     ii += 7;
1108   }
1109 
1110   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1112   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1113   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115   PetscFunctionReturn(0);
1116 }
1117 
1118 /* ----------------------------------------------------------- */
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1124   IS             iscol=a->col,isrow=a->row;
1125   PetscErrorCode ierr;
1126   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1127   PetscInt       i,n=a->mbs;
1128   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1129   MatScalar      *aa=a->a,*v;
1130   PetscScalar    *x,*b,*s,*t,*ls;
1131 
1132   PetscFunctionBegin;
1133   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135   t  = a->solve_work;
1136 
1137   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1138   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1139 
1140   /* forward solve the lower triangular */
1141   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1142   for (i=1; i<n; i++) {
1143     v   = aa + bs2*ai[i];
1144     vi  = aj + ai[i];
1145     nz  = a->diag[i] - ai[i];
1146     s = t + bs*i;
1147     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1148     while (nz--) {
1149       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1150       v += bs2;
1151     }
1152   }
1153   /* backward solve the upper triangular */
1154   ls = a->solve_work + A->cmap->n;
1155   for (i=n-1; i>=0; i--){
1156     v   = aa + bs2*(a->diag[i] + 1);
1157     vi  = aj + a->diag[i] + 1;
1158     nz  = ai[i+1] - a->diag[i] - 1;
1159     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1160     while (nz--) {
1161       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1162       v += bs2;
1163     }
1164     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1165     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1166   }
1167 
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNCT__
1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1179 {
1180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1181   IS             iscol=a->col,isrow=a->row;
1182   PetscErrorCode ierr;
1183   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1184   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1185   MatScalar      *aa=a->a,*v;
1186   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1187   PetscScalar    *x,*b,*t;
1188 
1189   PetscFunctionBegin;
1190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192   t  = a->solve_work;
1193 
1194   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1195   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1196 
1197   /* forward solve the lower triangular */
1198   idx    = 7*(*r++);
1199   t[0] = b[idx];   t[1] = b[1+idx];
1200   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201   t[5] = b[5+idx]; t[6] = b[6+idx];
1202 
1203   for (i=1; i<n; i++) {
1204     v     = aa + 49*ai[i];
1205     vi    = aj + ai[i];
1206     nz    = diag[i] - ai[i];
1207     idx   = 7*(*r++);
1208     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1210     while (nz--) {
1211       idx   = 7*(*vi++);
1212       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213       x4    = t[3+idx];x5 = t[4+idx];
1214       x6    = t[5+idx];x7 = t[6+idx];
1215       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1222       v += 49;
1223     }
1224     idx = 7*i;
1225     t[idx]   = s1;t[1+idx] = s2;
1226     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227     t[5+idx] = s6;t[6+idx] = s7;
1228   }
1229   /* backward solve the upper triangular */
1230   for (i=n-1; i>=0; i--){
1231     v    = aa + 49*diag[i] + 49;
1232     vi   = aj + diag[i] + 1;
1233     nz   = ai[i+1] - diag[i] - 1;
1234     idt  = 7*i;
1235     s1 = t[idt];  s2 = t[1+idt];
1236     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237     s6 = t[5+idt];s7 = t[6+idt];
1238     while (nz--) {
1239       idx   = 7*(*vi++);
1240       x1    = t[idx];   x2 = t[1+idx];
1241       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242       x6    = t[5+idx]; x7 = t[6+idx];
1243       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1250       v += 49;
1251     }
1252     idc = 7*(*c--);
1253     v   = aa + 49*diag[i];
1254     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1268   }
1269 
1270   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1271   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1272   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1273   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 #undef __FUNCT__
1279 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1280 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1281 {
1282   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1283   IS             iscol=a->col,isrow=a->row;
1284   PetscErrorCode ierr;
1285   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
1286   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
1287   MatScalar      *aa=a->a,*v;
1288   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1289   PetscScalar    *x,*b,*t;
1290 
1291   PetscFunctionBegin;
1292   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1293   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1294   t  = a->solve_work;
1295 
1296   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1297   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1298 
1299   /* forward solve the lower triangular */
1300   idx    = 7*r[0];
1301   t[0] = b[idx];   t[1] = b[1+idx];
1302   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1303   t[5] = b[5+idx]; t[6] = b[6+idx];
1304 
1305   for (i=1; i<n; i++) {
1306     v     = aa + 49*ai[i];
1307     vi    = aj + ai[i];
1308     nz    = ai[i+1] - ai[i];
1309     idx   = 7*r[i];
1310     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1311     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1312     for(m=0;m<nz;m++){
1313       idx   = 7*vi[m];
1314       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1315       x4    = t[3+idx];x5 = t[4+idx];
1316       x6    = t[5+idx];x7 = t[6+idx];
1317       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1318       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1319       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1320       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1321       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1322       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1323       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1324       v += 49;
1325     }
1326     idx = 7*i;
1327     t[idx]   = s1;t[1+idx] = s2;
1328     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1329     t[5+idx] = s6;t[6+idx] = s7;
1330   }
1331   /* backward solve the upper triangular */
1332   for (i=n-1; i>=0; i--){
1333     k    = 2*n-i;
1334     v    = aa + 49*ai[k];
1335     vi   = aj + ai[k];
1336     nz   = ai[k+1] - ai[k] - 1;
1337     idt  = 7*i;
1338     s1 = t[idt];  s2 = t[1+idt];
1339     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1340     s6 = t[5+idt];s7 = t[6+idt];
1341     for(m=0;m<nz;m++){
1342       idx   = 7*vi[m];
1343       x1    = t[idx];   x2 = t[1+idx];
1344       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1345       x6    = t[5+idx]; x7 = t[6+idx];
1346       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1347       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1348       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1349       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1350       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1351       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1352       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1353       v += 49;
1354     }
1355     idc = 7*c[i];
1356     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1357                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1358     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1359                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1360     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1361                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1362     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1363                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1364     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1365                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1366     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1367                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1368     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1369                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1370   }
1371 
1372   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1373   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1374   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1375   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1376   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 #undef __FUNCT__
1381 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1382 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1383 {
1384   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1385   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1386   PetscErrorCode    ierr;
1387   PetscInt          *diag = a->diag,jdx;
1388   const MatScalar   *aa=a->a,*v;
1389   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1390   const PetscScalar *b;
1391 
1392   PetscFunctionBegin;
1393   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1394   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1395   /* forward solve the lower triangular */
1396   idx    = 0;
1397   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1398   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1399   x[6] = b[6+idx];
1400   for (i=1; i<n; i++) {
1401     v     =  aa + 49*ai[i];
1402     vi    =  aj + ai[i];
1403     nz    =  diag[i] - ai[i];
1404     idx   =  7*i;
1405     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1406     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1407     s7  =  b[6+idx];
1408     while (nz--) {
1409       jdx   = 7*(*vi++);
1410       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1411       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1412       x7    = x[6+jdx];
1413       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1414       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1415       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1416       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1417       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1418       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1419       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1420       v += 49;
1421      }
1422     x[idx]   = s1;
1423     x[1+idx] = s2;
1424     x[2+idx] = s3;
1425     x[3+idx] = s4;
1426     x[4+idx] = s5;
1427     x[5+idx] = s6;
1428     x[6+idx] = s7;
1429   }
1430   /* backward solve the upper triangular */
1431   for (i=n-1; i>=0; i--){
1432     v    = aa + 49*diag[i] + 49;
1433     vi   = aj + diag[i] + 1;
1434     nz   = ai[i+1] - diag[i] - 1;
1435     idt  = 7*i;
1436     s1 = x[idt];   s2 = x[1+idt];
1437     s3 = x[2+idt]; s4 = x[3+idt];
1438     s5 = x[4+idt]; s6 = x[5+idt];
1439     s7 = x[6+idt];
1440     while (nz--) {
1441       idx   = 7*(*vi++);
1442       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1443       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1444       x7    = x[6+idx];
1445       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1446       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1447       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1448       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1449       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1450       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1451       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1452       v += 49;
1453     }
1454     v        = aa + 49*diag[i];
1455     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1456                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1457     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1458                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1459     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1460                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1461     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1462                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1463     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1464                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1465     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1466                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1467     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1468                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1469   }
1470 
1471   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1472   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1473   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1474   PetscFunctionReturn(0);
1475 }
1476 
1477 #undef __FUNCT__
1478 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1479 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1480 {
1481     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1482     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1483     PetscErrorCode    ierr;
1484     PetscInt          idx,jdx,idt;
1485     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1486     const MatScalar   *aa=a->a,*v;
1487     PetscScalar       *x;
1488     const PetscScalar *b;
1489     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1490 
1491     PetscFunctionBegin;
1492     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1493     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1494     /* forward solve the lower triangular */
1495     idx    = 0;
1496     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1497     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1498     for (i=1; i<n; i++) {
1499        v    = aa + bs2*ai[i];
1500        vi   = aj + ai[i];
1501        nz   = ai[i+1] - ai[i];
1502       idx   = bs*i;
1503        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1504        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1505        for(k=0;k<nz;k++) {
1506           jdx   = bs*vi[k];
1507           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1508 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1509           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1510           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1511           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1512 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1513           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1514 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1515 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1516           v   +=  bs2;
1517         }
1518 
1519        x[idx]   = s1;
1520        x[1+idx] = s2;
1521        x[2+idx] = s3;
1522        x[3+idx] = s4;
1523        x[4+idx] = s5;
1524        x[5+idx] = s6;
1525        x[6+idx] = s7;
1526     }
1527 
1528    /* backward solve the upper triangular */
1529   for (i=n-1; i>=0; i--){
1530      v   = aa + bs2*ai[2*n-i];
1531      vi  = aj + ai[2*n-i];
1532      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1533      idt = bs*i;
1534      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1535      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1536     for(k=0;k<nz;k++) {
1537       idx   = bs*vi[k];
1538        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1539        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1540        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1541        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1542        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1543        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1544        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1545        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1546        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1547         v   +=  bs2;
1548     }
1549     /* x = inv_diagonal*x */
1550     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1551     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1552     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1553     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1554     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1555     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1556     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1557   }
1558 
1559   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1560   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1561   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 #undef __FUNCT__
1566 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
1567 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1568 {
1569     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1570     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
1571     PetscErrorCode    ierr;
1572     PetscInt          idx,jdx,idt;
1573     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1574     const MatScalar   *aa=a->a,*v;
1575     PetscScalar       *x;
1576     const PetscScalar *b;
1577     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1578 
1579     PetscFunctionBegin;
1580     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1581     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1582     /* forward solve the lower triangular */
1583     idx    = 0;
1584     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1585     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1586     for (i=1; i<n; i++) {
1587        v    = aa + bs2*ai[i];
1588        vi   = aj + ai[i];
1589        nz   = ai[i+1] - ai[i];
1590       idx   = bs*i;
1591        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1592        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1593        for(k=0;k<nz;k++) {
1594           jdx   = bs*vi[k];
1595           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1596 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1597           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1598           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1599           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1600 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1601           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1602 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1603 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1604           v   +=  bs2;
1605         }
1606 
1607        x[idx]   = s1;
1608        x[1+idx] = s2;
1609        x[2+idx] = s3;
1610        x[3+idx] = s4;
1611        x[4+idx] = s5;
1612        x[5+idx] = s6;
1613        x[6+idx] = s7;
1614     }
1615 
1616    /* backward solve the upper triangular */
1617   for (i=n-1; i>=0; i--){
1618     v   = aa + bs2*(adiag[i+1]+1);
1619      vi  = aj + adiag[i+1]+1;
1620      nz  = adiag[i] - adiag[i+1]-1;
1621      idt = bs*i;
1622      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1623      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1624     for(k=0;k<nz;k++) {
1625       idx   = bs*vi[k];
1626        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1627        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1628        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1629        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1630        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1631        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1632        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1633        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1634        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1635         v   +=  bs2;
1636     }
1637     /* x = inv_diagonal*x */
1638     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1639     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1640     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1641     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1642     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1643     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1644     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1645   }
1646 
1647   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1648   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1649   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1650   PetscFunctionReturn(0);
1651 }
1652 
1653 #undef __FUNCT__
1654 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1655 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1656 {
1657   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1658   IS                iscol=a->col,isrow=a->row;
1659   PetscErrorCode    ierr;
1660   const PetscInt    *r,*c,*rout,*cout;
1661   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1662   const MatScalar   *aa=a->a,*v;
1663   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1664   const PetscScalar *b;
1665   PetscFunctionBegin;
1666   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1667   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1668   t  = a->solve_work;
1669 
1670   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1671   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1672 
1673   /* forward solve the lower triangular */
1674   idx    = 6*(*r++);
1675   t[0] = b[idx];   t[1] = b[1+idx];
1676   t[2] = b[2+idx]; t[3] = b[3+idx];
1677   t[4] = b[4+idx]; t[5] = b[5+idx];
1678   for (i=1; i<n; i++) {
1679     v     = aa + 36*ai[i];
1680     vi    = aj + ai[i];
1681     nz    = diag[i] - ai[i];
1682     idx   = 6*(*r++);
1683     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1684     s5  = b[4+idx]; s6 = b[5+idx];
1685     while (nz--) {
1686       idx   = 6*(*vi++);
1687       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1688       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1689       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1690       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1691       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1692       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1693       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1694       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1695       v += 36;
1696     }
1697     idx = 6*i;
1698     t[idx]   = s1;t[1+idx] = s2;
1699     t[2+idx] = s3;t[3+idx] = s4;
1700     t[4+idx] = s5;t[5+idx] = s6;
1701   }
1702   /* backward solve the upper triangular */
1703   for (i=n-1; i>=0; i--){
1704     v    = aa + 36*diag[i] + 36;
1705     vi   = aj + diag[i] + 1;
1706     nz   = ai[i+1] - diag[i] - 1;
1707     idt  = 6*i;
1708     s1 = t[idt];  s2 = t[1+idt];
1709     s3 = t[2+idt];s4 = t[3+idt];
1710     s5 = t[4+idt];s6 = t[5+idt];
1711     while (nz--) {
1712       idx   = 6*(*vi++);
1713       x1    = t[idx];   x2 = t[1+idx];
1714       x3    = t[2+idx]; x4 = t[3+idx];
1715       x5    = t[4+idx]; x6 = t[5+idx];
1716       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1717       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1718       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1719       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1720       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1721       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1722       v += 36;
1723     }
1724     idc = 6*(*c--);
1725     v   = aa + 36*diag[i];
1726     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1727                                  v[18]*s4+v[24]*s5+v[30]*s6;
1728     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1729                                  v[19]*s4+v[25]*s5+v[31]*s6;
1730     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1731                                  v[20]*s4+v[26]*s5+v[32]*s6;
1732     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1733                                  v[21]*s4+v[27]*s5+v[33]*s6;
1734     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1735                                  v[22]*s4+v[28]*s5+v[34]*s6;
1736     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1737                                  v[23]*s4+v[29]*s5+v[35]*s6;
1738   }
1739 
1740   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1741   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1742   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1743   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1744   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1745   PetscFunctionReturn(0);
1746 }
1747 
1748 #undef __FUNCT__
1749 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
1750 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
1751 {
1752   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1753   IS                iscol=a->col,isrow=a->row;
1754   PetscErrorCode    ierr;
1755   const PetscInt    *r,*c,*rout,*cout;
1756   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
1757   const MatScalar   *aa=a->a,*v;
1758   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1759   const PetscScalar *b;
1760   PetscFunctionBegin;
1761   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763   t  = a->solve_work;
1764 
1765   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1766   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1767 
1768   /* forward solve the lower triangular */
1769   idx    = 6*r[0];
1770   t[0] = b[idx];   t[1] = b[1+idx];
1771   t[2] = b[2+idx]; t[3] = b[3+idx];
1772   t[4] = b[4+idx]; t[5] = b[5+idx];
1773   for (i=1; i<n; i++) {
1774     v     = aa + 36*ai[i];
1775     vi    = aj + ai[i];
1776     nz    = ai[i+1] - ai[i];
1777     idx   = 6*r[i];
1778     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1779     s5  = b[4+idx]; s6 = b[5+idx];
1780     for(m=0;m<nz;m++){
1781       idx   = 6*vi[m];
1782       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1783       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1784       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1785       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1786       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1787       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1788       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1789       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1790       v += 36;
1791     }
1792     idx = 6*i;
1793     t[idx]   = s1;t[1+idx] = s2;
1794     t[2+idx] = s3;t[3+idx] = s4;
1795     t[4+idx] = s5;t[5+idx] = s6;
1796   }
1797   /* backward solve the upper triangular */
1798   for (i=n-1; i>=0; i--){
1799     k    = 2*n-i;
1800     v    = aa + 36*ai[k];
1801     vi   = aj + ai[k];
1802     nz   = ai[k+1] - ai[k] - 1;
1803     idt  = 6*i;
1804     s1 = t[idt];  s2 = t[1+idt];
1805     s3 = t[2+idt];s4 = t[3+idt];
1806     s5 = t[4+idt];s6 = t[5+idt];
1807     for(m=0;m<nz;m++){
1808       idx   = 6*vi[m];
1809       x1    = t[idx];   x2 = t[1+idx];
1810       x3    = t[2+idx]; x4 = t[3+idx];
1811       x5    = t[4+idx]; x6 = t[5+idx];
1812       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1813       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1814       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1815       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1816       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1817       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1818       v += 36;
1819     }
1820     idc = 6*c[i];
1821     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1822                                  v[18]*s4+v[24]*s5+v[30]*s6;
1823     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1824                                  v[19]*s4+v[25]*s5+v[31]*s6;
1825     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1826                                  v[20]*s4+v[26]*s5+v[32]*s6;
1827     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1828                                  v[21]*s4+v[27]*s5+v[33]*s6;
1829     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1830                                  v[22]*s4+v[28]*s5+v[34]*s6;
1831     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1832                                  v[23]*s4+v[29]*s5+v[35]*s6;
1833   }
1834 
1835   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1836   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1837   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1838   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1839   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1840   PetscFunctionReturn(0);
1841 }
1842 
1843 
1844 #undef __FUNCT__
1845 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1846 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1847 {
1848   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1849   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1850   PetscErrorCode    ierr;
1851   PetscInt          *diag = a->diag,jdx;
1852   const MatScalar   *aa=a->a,*v;
1853   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1854   const PetscScalar *b;
1855 
1856   PetscFunctionBegin;
1857   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1858   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1859   /* forward solve the lower triangular */
1860   idx    = 0;
1861   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1862   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1863   for (i=1; i<n; i++) {
1864     v     =  aa + 36*ai[i];
1865     vi    =  aj + ai[i];
1866     nz    =  diag[i] - ai[i];
1867     idx   =  6*i;
1868     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1869     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1870     while (nz--) {
1871       jdx   = 6*(*vi++);
1872       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1873       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1874       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1875       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1876       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1877       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1878       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1879       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1880       v += 36;
1881      }
1882     x[idx]   = s1;
1883     x[1+idx] = s2;
1884     x[2+idx] = s3;
1885     x[3+idx] = s4;
1886     x[4+idx] = s5;
1887     x[5+idx] = s6;
1888   }
1889   /* backward solve the upper triangular */
1890   for (i=n-1; i>=0; i--){
1891     v    = aa + 36*diag[i] + 36;
1892     vi   = aj + diag[i] + 1;
1893     nz   = ai[i+1] - diag[i] - 1;
1894     idt  = 6*i;
1895     s1 = x[idt];   s2 = x[1+idt];
1896     s3 = x[2+idt]; s4 = x[3+idt];
1897     s5 = x[4+idt]; s6 = x[5+idt];
1898     while (nz--) {
1899       idx   = 6*(*vi++);
1900       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1901       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1902       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1903       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1904       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1905       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1906       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1907       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1908       v += 36;
1909     }
1910     v        = aa + 36*diag[i];
1911     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1912     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1913     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1914     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1915     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1916     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1917   }
1918 
1919   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1920   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1921   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1922   PetscFunctionReturn(0);
1923 }
1924 
1925 #undef __FUNCT__
1926 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1927 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1928 {
1929     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1930     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1931     PetscErrorCode    ierr;
1932     PetscInt          idx,jdx,idt;
1933     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1934     const MatScalar   *aa=a->a,*v;
1935     PetscScalar       *x;
1936     const PetscScalar *b;
1937     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1938 
1939     PetscFunctionBegin;
1940     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1941     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1942     /* forward solve the lower triangular */
1943     idx    = 0;
1944     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1945     x[4] = b[4+idx];x[5] = b[5+idx];
1946     for (i=1; i<n; i++) {
1947        v    = aa + bs2*ai[i];
1948        vi   = aj + ai[i];
1949        nz   = ai[i+1] - ai[i];
1950       idx   = bs*i;
1951        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1952        s5   = b[4+idx];s6 = b[5+idx];
1953        for(k=0;k<nz;k++){
1954           jdx   = bs*vi[k];
1955           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1956 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1957           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1958           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1959           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1960 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1961           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1962 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1963           v   +=  bs2;
1964         }
1965 
1966        x[idx]   = s1;
1967        x[1+idx] = s2;
1968        x[2+idx] = s3;
1969        x[3+idx] = s4;
1970        x[4+idx] = s5;
1971        x[5+idx] = s6;
1972     }
1973 
1974    /* backward solve the upper triangular */
1975   for (i=n-1; i>=0; i--){
1976      v   = aa + bs2*ai[2*n-i];
1977      vi  = aj + ai[2*n-i];
1978      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1979      idt = bs*i;
1980      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1981      s5 = x[4+idt];s6 = x[5+idt];
1982      for(k=0;k<nz;k++){
1983       idx   = bs*vi[k];
1984        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1985        x5    = x[4+idx];x6 = x[5+idx];
1986        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1987        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1988        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1989        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1990        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1991        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1992         v   +=  bs2;
1993     }
1994     /* x = inv_diagonal*x */
1995    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1996    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1997    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1998    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1999    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2000    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2001   }
2002 
2003   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2004   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2005   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2006   PetscFunctionReturn(0);
2007 }
2008 
2009 #undef __FUNCT__
2010 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2"
2011 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2012 {
2013     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2014     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2015     PetscErrorCode    ierr;
2016     PetscInt          idx,jdx,idt;
2017     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2018     const MatScalar   *aa=a->a,*v;
2019     PetscScalar       *x;
2020     const PetscScalar *b;
2021     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2022 
2023     PetscFunctionBegin;
2024     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2025     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2026     /* forward solve the lower triangular */
2027     idx    = 0;
2028     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2029     x[4] = b[4+idx];x[5] = b[5+idx];
2030     for (i=1; i<n; i++) {
2031        v    = aa + bs2*ai[i];
2032        vi   = aj + ai[i];
2033        nz   = ai[i+1] - ai[i];
2034       idx   = bs*i;
2035        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2036        s5   = b[4+idx];s6 = b[5+idx];
2037        for(k=0;k<nz;k++){
2038           jdx   = bs*vi[k];
2039           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2040 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2041           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2042           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2043           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2044 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2045           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2046 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2047           v   +=  bs2;
2048         }
2049 
2050        x[idx]   = s1;
2051        x[1+idx] = s2;
2052        x[2+idx] = s3;
2053        x[3+idx] = s4;
2054        x[4+idx] = s5;
2055        x[5+idx] = s6;
2056     }
2057 
2058    /* backward solve the upper triangular */
2059   for (i=n-1; i>=0; i--){
2060     v   = aa + bs2*(adiag[i+1]+1);
2061      vi  = aj + adiag[i+1]+1;
2062      nz  = adiag[i] - adiag[i+1]-1;
2063      idt = bs*i;
2064      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2065      s5 = x[4+idt];s6 = x[5+idt];
2066      for(k=0;k<nz;k++){
2067       idx   = bs*vi[k];
2068        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2069        x5    = x[4+idx];x6 = x[5+idx];
2070        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2071        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2072        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2073        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2074        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2075        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2076         v   +=  bs2;
2077     }
2078     /* x = inv_diagonal*x */
2079    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2080    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2081    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2082    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2083    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2084    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2085   }
2086 
2087   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2088   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2089   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2090   PetscFunctionReturn(0);
2091 }
2092 
2093 #undef __FUNCT__
2094 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2095 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
2096 {
2097   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2098   IS                iscol=a->col,isrow=a->row;
2099   PetscErrorCode    ierr;
2100   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
2101   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2102   const MatScalar   *aa=a->a,*v;
2103   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2104   const PetscScalar *b;
2105 
2106   PetscFunctionBegin;
2107   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2108   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2109   t  = a->solve_work;
2110 
2111   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2112   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2113 
2114   /* forward solve the lower triangular */
2115   idx    = 5*(*r++);
2116   t[0] = b[idx];   t[1] = b[1+idx];
2117   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2118   for (i=1; i<n; i++) {
2119     v     = aa + 25*ai[i];
2120     vi    = aj + ai[i];
2121     nz    = diag[i] - ai[i];
2122     idx   = 5*(*r++);
2123     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2124     s5  = b[4+idx];
2125     while (nz--) {
2126       idx   = 5*(*vi++);
2127       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2128       x4    = t[3+idx];x5 = t[4+idx];
2129       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2130       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2131       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2132       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2133       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2134       v += 25;
2135     }
2136     idx = 5*i;
2137     t[idx]   = s1;t[1+idx] = s2;
2138     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2139   }
2140   /* backward solve the upper triangular */
2141   for (i=n-1; i>=0; i--){
2142     v    = aa + 25*diag[i] + 25;
2143     vi   = aj + diag[i] + 1;
2144     nz   = ai[i+1] - diag[i] - 1;
2145     idt  = 5*i;
2146     s1 = t[idt];  s2 = t[1+idt];
2147     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2148     while (nz--) {
2149       idx   = 5*(*vi++);
2150       x1    = t[idx];   x2 = t[1+idx];
2151       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2152       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2153       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2154       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2155       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2156       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2157       v += 25;
2158     }
2159     idc = 5*(*c--);
2160     v   = aa + 25*diag[i];
2161     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2162                                  v[15]*s4+v[20]*s5;
2163     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2164                                  v[16]*s4+v[21]*s5;
2165     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2166                                  v[17]*s4+v[22]*s5;
2167     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2168                                  v[18]*s4+v[23]*s5;
2169     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2170                                  v[19]*s4+v[24]*s5;
2171   }
2172 
2173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2177   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2178   PetscFunctionReturn(0);
2179 }
2180 
2181 #undef __FUNCT__
2182 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2183 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
2184 {
2185   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2186   IS                iscol=a->col,isrow=a->row;
2187   PetscErrorCode    ierr;
2188   const PetscInt    *r,*c,*rout,*cout;
2189   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2190   const MatScalar   *aa=a->a,*v;
2191   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2192   const PetscScalar *b;
2193 
2194   PetscFunctionBegin;
2195   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2196   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2197   t  = a->solve_work;
2198 
2199   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2200   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2201 
2202   /* forward solve the lower triangular */
2203   idx    = 5*r[0];
2204   t[0] = b[idx];   t[1] = b[1+idx];
2205   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2206   for (i=1; i<n; i++) {
2207     v     = aa + 25*ai[i];
2208     vi    = aj + ai[i];
2209     nz    = ai[i+1] - ai[i];
2210     idx   = 5*r[i];
2211     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2212     s5  = b[4+idx];
2213     for(m=0;m<nz;m++){
2214       idx   = 5*vi[m];
2215       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2216       x4    = t[3+idx];x5 = t[4+idx];
2217       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2218       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2219       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2220       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2221       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2222       v += 25;
2223     }
2224     idx = 5*i;
2225     t[idx]   = s1;t[1+idx] = s2;
2226     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2227   }
2228   /* backward solve the upper triangular */
2229   for (i=n-1; i>=0; i--){
2230     k    = 2*n-i;
2231     v    = aa + 25*ai[k];
2232     vi   = aj + ai[k];
2233     nz   = ai[k+1] - ai[k] - 1;
2234     idt  = 5*i;
2235     s1 = t[idt];  s2 = t[1+idt];
2236     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2237     for(m=0;m<nz;m++){
2238       idx   = 5*vi[m];
2239       x1    = t[idx];   x2 = t[1+idx];
2240       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2241       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2242       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2243       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2244       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2245       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2246       v += 25;
2247     }
2248     idc = 5*c[i];
2249     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2250                                  v[15]*s4+v[20]*s5;
2251     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2252                                  v[16]*s4+v[21]*s5;
2253     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2254                                  v[17]*s4+v[22]*s5;
2255     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2256                                  v[18]*s4+v[23]*s5;
2257     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2258                                  v[19]*s4+v[24]*s5;
2259   }
2260 
2261   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2262   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2263   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2264   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2265   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2266   PetscFunctionReturn(0);
2267 }
2268 
2269 #undef __FUNCT__
2270 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2"
2271 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2272 {
2273   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2274   IS                iscol=a->col,isrow=a->row;
2275   PetscErrorCode    ierr;
2276   const PetscInt    *r,*c,*rout,*cout;
2277   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2278   const MatScalar   *aa=a->a,*v;
2279   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2280   const PetscScalar *b;
2281 
2282   PetscFunctionBegin;
2283   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2284   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2285   t  = a->solve_work;
2286 
2287   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2288   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2289 
2290   /* forward solve the lower triangular */
2291   idx    = 5*r[0];
2292   t[0] = b[idx];   t[1] = b[1+idx];
2293   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2294   for (i=1; i<n; i++) {
2295     v     = aa + 25*ai[i];
2296     vi    = aj + ai[i];
2297     nz    = ai[i+1] - ai[i];
2298     idx   = 5*r[i];
2299     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2300     s5  = b[4+idx];
2301     for(m=0;m<nz;m++){
2302       idx   = 5*vi[m];
2303       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2304       x4    = t[3+idx];x5 = t[4+idx];
2305       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2306       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2307       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2308       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2309       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2310       v += 25;
2311     }
2312     idx = 5*i;
2313     t[idx]   = s1;t[1+idx] = s2;
2314     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2315   }
2316   /* backward solve the upper triangular */
2317   for (i=n-1; i>=0; i--){
2318     v    = aa + 25*(adiag[i+1]+1);
2319     vi   = aj + adiag[i+1]+1;
2320     nz   = adiag[i] - adiag[i+1] - 1;
2321     idt  = 5*i;
2322     s1 = t[idt];  s2 = t[1+idt];
2323     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2324     for(m=0;m<nz;m++){
2325       idx   = 5*vi[m];
2326       x1    = t[idx];   x2 = t[1+idx];
2327       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2328       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2329       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2330       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2331       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2332       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2333       v += 25;
2334     }
2335     idc = 5*c[i];
2336     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2337                                  v[15]*s4+v[20]*s5;
2338     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2339                                  v[16]*s4+v[21]*s5;
2340     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2341                                  v[17]*s4+v[22]*s5;
2342     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2343                                  v[18]*s4+v[23]*s5;
2344     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2345                                  v[19]*s4+v[24]*s5;
2346   }
2347 
2348   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2349   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2350   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2351   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2352   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2353   PetscFunctionReturn(0);
2354 }
2355 
2356 #undef __FUNCT__
2357 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2358 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
2359 {
2360   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2361   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2362   PetscErrorCode    ierr;
2363   PetscInt          *diag = a->diag,jdx;
2364   const MatScalar   *aa=a->a,*v;
2365   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2366   const PetscScalar *b;
2367 
2368   PetscFunctionBegin;
2369   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2370   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2371   /* forward solve the lower triangular */
2372   idx    = 0;
2373   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2374   for (i=1; i<n; i++) {
2375     v     =  aa + 25*ai[i];
2376     vi    =  aj + ai[i];
2377     nz    =  diag[i] - ai[i];
2378     idx   =  5*i;
2379     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2380     while (nz--) {
2381       jdx   = 5*(*vi++);
2382       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2383       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2384       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2385       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2386       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2387       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2388       v    += 25;
2389     }
2390     x[idx]   = s1;
2391     x[1+idx] = s2;
2392     x[2+idx] = s3;
2393     x[3+idx] = s4;
2394     x[4+idx] = s5;
2395   }
2396   /* backward solve the upper triangular */
2397   for (i=n-1; i>=0; i--){
2398     v    = aa + 25*diag[i] + 25;
2399     vi   = aj + diag[i] + 1;
2400     nz   = ai[i+1] - diag[i] - 1;
2401     idt  = 5*i;
2402     s1 = x[idt];  s2 = x[1+idt];
2403     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2404     while (nz--) {
2405       idx   = 5*(*vi++);
2406       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2407       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2408       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2409       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2410       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2411       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2412       v    += 25;
2413     }
2414     v        = aa + 25*diag[i];
2415     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2416     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2417     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2418     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2419     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2420   }
2421 
2422   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2423   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2424   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2425   PetscFunctionReturn(0);
2426 }
2427 
2428 #undef __FUNCT__
2429 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2430 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2431 {
2432   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2433   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2434   PetscErrorCode    ierr;
2435   PetscInt          jdx;
2436   const MatScalar   *aa=a->a,*v;
2437   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2438   const PetscScalar *b;
2439 
2440   PetscFunctionBegin;
2441   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2442   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2443   /* forward solve the lower triangular */
2444   idx    = 0;
2445   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2446   for (i=1; i<n; i++) {
2447     v   = aa + 25*ai[i];
2448     vi  = aj + ai[i];
2449     nz  = ai[i+1] - ai[i];
2450     idx = 5*i;
2451     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2452     for(k=0;k<nz;k++) {
2453       jdx   = 5*vi[k];
2454       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2455       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2456       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2457       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2458       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2459       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2460       v    += 25;
2461     }
2462     x[idx]   = s1;
2463     x[1+idx] = s2;
2464     x[2+idx] = s3;
2465     x[3+idx] = s4;
2466     x[4+idx] = s5;
2467   }
2468 
2469   /* backward solve the upper triangular */
2470   for (i=n-1; i>=0; i--){
2471     v   = aa + 25*ai[2*n-i];
2472     vi  = aj + ai[2*n-i];
2473     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2474     idt = 5*i;
2475     s1 = x[idt];  s2 = x[1+idt];
2476     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2477     for(k=0;k<nz;k++){
2478       idx   = 5*vi[k];
2479       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2480       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2481       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2482       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2483       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2484       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2485       v    += 25;
2486     }
2487     /* x = inv_diagonal*x */
2488     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2489     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2490     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2491     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2492     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2493   }
2494 
2495   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2496   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2497   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2498   PetscFunctionReturn(0);
2499 }
2500 
2501 #undef __FUNCT__
2502 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2"
2503 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2504 {
2505   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2506   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
2507   PetscErrorCode    ierr;
2508   PetscInt          jdx;
2509   const MatScalar   *aa=a->a,*v;
2510   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2511   const PetscScalar *b;
2512 
2513   PetscFunctionBegin;
2514   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2515   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2516   /* forward solve the lower triangular */
2517   idx    = 0;
2518   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2519   for (i=1; i<n; i++) {
2520     v   = aa + 25*ai[i];
2521     vi  = aj + ai[i];
2522     nz  = ai[i+1] - ai[i];
2523     idx = 5*i;
2524     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2525     for(k=0;k<nz;k++) {
2526       jdx   = 5*vi[k];
2527       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2528       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2529       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2530       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2531       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2532       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2533       v    += 25;
2534     }
2535     x[idx]   = s1;
2536     x[1+idx] = s2;
2537     x[2+idx] = s3;
2538     x[3+idx] = s4;
2539     x[4+idx] = s5;
2540   }
2541 
2542   /* backward solve the upper triangular */
2543   for (i=n-1; i>=0; i--){
2544     v   = aa + 25*(adiag[i+1]+1);
2545     vi  = aj + adiag[i+1]+1;
2546     nz  = adiag[i] - adiag[i+1]-1;
2547     idt = 5*i;
2548     s1 = x[idt];  s2 = x[1+idt];
2549     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2550     for(k=0;k<nz;k++){
2551       idx   = 5*vi[k];
2552       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2553       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2554       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2555       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2556       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2557       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2558       v    += 25;
2559     }
2560     /* x = inv_diagonal*x */
2561     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2562     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2563     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2564     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2565     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2566   }
2567 
2568   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2569   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2570   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2571   PetscFunctionReturn(0);
2572 }
2573 
2574 #undef __FUNCT__
2575 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2576 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2577 {
2578   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2579   IS                iscol=a->col,isrow=a->row;
2580   PetscErrorCode    ierr;
2581   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2582   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2583   const MatScalar   *aa=a->a,*v;
2584   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2585   const PetscScalar *b;
2586 
2587   PetscFunctionBegin;
2588   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2589   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2590   t  = a->solve_work;
2591 
2592   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2593   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2594 
2595   /* forward solve the lower triangular */
2596   idx    = 4*(*r++);
2597   t[0] = b[idx];   t[1] = b[1+idx];
2598   t[2] = b[2+idx]; t[3] = b[3+idx];
2599   for (i=1; i<n; i++) {
2600     v     = aa + 16*ai[i];
2601     vi    = aj + ai[i];
2602     nz    = diag[i] - ai[i];
2603     idx   = 4*(*r++);
2604     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2605     while (nz--) {
2606       idx   = 4*(*vi++);
2607       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2608       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2609       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2610       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2611       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2612       v    += 16;
2613     }
2614     idx        = 4*i;
2615     t[idx]   = s1;t[1+idx] = s2;
2616     t[2+idx] = s3;t[3+idx] = s4;
2617   }
2618   /* backward solve the upper triangular */
2619   for (i=n-1; i>=0; i--){
2620     v    = aa + 16*diag[i] + 16;
2621     vi   = aj + diag[i] + 1;
2622     nz   = ai[i+1] - diag[i] - 1;
2623     idt  = 4*i;
2624     s1 = t[idt];  s2 = t[1+idt];
2625     s3 = t[2+idt];s4 = t[3+idt];
2626     while (nz--) {
2627       idx   = 4*(*vi++);
2628       x1    = t[idx];   x2 = t[1+idx];
2629       x3    = t[2+idx]; x4 = t[3+idx];
2630       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2631       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2632       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2633       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2634       v += 16;
2635     }
2636     idc      = 4*(*c--);
2637     v        = aa + 16*diag[i];
2638     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2639     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2640     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2641     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2642   }
2643 
2644   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2645   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2646   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2647   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2648   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2649   PetscFunctionReturn(0);
2650 }
2651 
2652 #undef __FUNCT__
2653 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2654 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
2655 {
2656   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2657   IS                iscol=a->col,isrow=a->row;
2658   PetscErrorCode    ierr;
2659   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
2660   const PetscInt    *r,*c,*rout,*cout;
2661   const MatScalar   *aa=a->a,*v;
2662   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2663   const PetscScalar *b;
2664 
2665   PetscFunctionBegin;
2666   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2667   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2668   t  = a->solve_work;
2669 
2670   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2671   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2672 
2673   /* forward solve the lower triangular */
2674   idx    = 4*r[0];
2675   t[0] = b[idx];   t[1] = b[1+idx];
2676   t[2] = b[2+idx]; t[3] = b[3+idx];
2677   for (i=1; i<n; i++) {
2678     v     = aa + 16*ai[i];
2679     vi    = aj + ai[i];
2680     nz    = ai[i+1] - ai[i];
2681     idx   = 4*r[i];
2682     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2683     for(m=0;m<nz;m++){
2684       idx   = 4*vi[m];
2685       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2686       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2687       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2688       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2689       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2690       v    += 16;
2691     }
2692     idx        = 4*i;
2693     t[idx]   = s1;t[1+idx] = s2;
2694     t[2+idx] = s3;t[3+idx] = s4;
2695   }
2696   /* backward solve the upper triangular */
2697   for (i=n-1; i>=0; i--){
2698     k    = 2*n-i;
2699     v    = aa + 16*ai[k];
2700     vi   = aj + ai[k];
2701     nz   = ai[k+1] - ai[k] - 1;
2702     idt  = 4*i;
2703     s1 = t[idt];  s2 = t[1+idt];
2704     s3 = t[2+idt];s4 = t[3+idt];
2705     for(m=0;m<nz;m++){
2706       idx   = 4*vi[m];
2707       x1    = t[idx];   x2 = t[1+idx];
2708       x3    = t[2+idx]; x4 = t[3+idx];
2709       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2710       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2711       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2712       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2713       v += 16;
2714     }
2715     idc      = 4*c[i];
2716     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2717     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2718     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2719     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2720   }
2721 
2722   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2723   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2724   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2725   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2726   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2727   PetscFunctionReturn(0);
2728 }
2729 
2730 #undef __FUNCT__
2731 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2"
2732 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2733 {
2734   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2735   IS                iscol=a->col,isrow=a->row;
2736   PetscErrorCode    ierr;
2737   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2738   const PetscInt    *r,*c,*rout,*cout;
2739   const MatScalar   *aa=a->a,*v;
2740   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2741   const PetscScalar *b;
2742 
2743   PetscFunctionBegin;
2744   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2745   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2746   t  = a->solve_work;
2747 
2748   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2749   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2750 
2751   /* forward solve the lower triangular */
2752   idx    = 4*r[0];
2753   t[0] = b[idx];   t[1] = b[1+idx];
2754   t[2] = b[2+idx]; t[3] = b[3+idx];
2755   for (i=1; i<n; i++) {
2756     v     = aa + 16*ai[i];
2757     vi    = aj + ai[i];
2758     nz    = ai[i+1] - ai[i];
2759     idx   = 4*r[i];
2760     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2761     for(m=0;m<nz;m++){
2762       idx   = 4*vi[m];
2763       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2764       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2765       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2766       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2767       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2768       v    += 16;
2769     }
2770     idx        = 4*i;
2771     t[idx]   = s1;t[1+idx] = s2;
2772     t[2+idx] = s3;t[3+idx] = s4;
2773   }
2774   /* backward solve the upper triangular */
2775   for (i=n-1; i>=0; i--){
2776     v    = aa + 16*(adiag[i+1]+1);
2777     vi   = aj + adiag[i+1]+1;
2778     nz   = adiag[i] - adiag[i+1] - 1;
2779     idt  = 4*i;
2780     s1 = t[idt];  s2 = t[1+idt];
2781     s3 = t[2+idt];s4 = t[3+idt];
2782     for(m=0;m<nz;m++){
2783       idx   = 4*vi[m];
2784       x1    = t[idx];   x2 = t[1+idx];
2785       x3    = t[2+idx]; x4 = t[3+idx];
2786       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2787       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2788       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2789       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2790       v += 16;
2791     }
2792     idc      = 4*c[i];
2793     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2794     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2795     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2796     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2797   }
2798 
2799   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2800   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2801   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2802   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2803   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2804   PetscFunctionReturn(0);
2805 }
2806 
2807 #undef __FUNCT__
2808 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2809 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2810 {
2811   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2812   IS                iscol=a->col,isrow=a->row;
2813   PetscErrorCode    ierr;
2814   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2815   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2816   const MatScalar   *aa=a->a,*v;
2817   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2818   PetscScalar       *x;
2819   const PetscScalar *b;
2820 
2821   PetscFunctionBegin;
2822   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2823   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2824   t  = (MatScalar *)a->solve_work;
2825 
2826   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2827   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2828 
2829   /* forward solve the lower triangular */
2830   idx    = 4*(*r++);
2831   t[0] = (MatScalar)b[idx];
2832   t[1] = (MatScalar)b[1+idx];
2833   t[2] = (MatScalar)b[2+idx];
2834   t[3] = (MatScalar)b[3+idx];
2835   for (i=1; i<n; i++) {
2836     v     = aa + 16*ai[i];
2837     vi    = aj + ai[i];
2838     nz    = diag[i] - ai[i];
2839     idx   = 4*(*r++);
2840     s1 = (MatScalar)b[idx];
2841     s2 = (MatScalar)b[1+idx];
2842     s3 = (MatScalar)b[2+idx];
2843     s4 = (MatScalar)b[3+idx];
2844     while (nz--) {
2845       idx   = 4*(*vi++);
2846       x1  = t[idx];
2847       x2  = t[1+idx];
2848       x3  = t[2+idx];
2849       x4  = t[3+idx];
2850       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2851       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2852       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2853       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2854       v    += 16;
2855     }
2856     idx        = 4*i;
2857     t[idx]   = s1;
2858     t[1+idx] = s2;
2859     t[2+idx] = s3;
2860     t[3+idx] = s4;
2861   }
2862   /* backward solve the upper triangular */
2863   for (i=n-1; i>=0; i--){
2864     v    = aa + 16*diag[i] + 16;
2865     vi   = aj + diag[i] + 1;
2866     nz   = ai[i+1] - diag[i] - 1;
2867     idt  = 4*i;
2868     s1 = t[idt];
2869     s2 = t[1+idt];
2870     s3 = t[2+idt];
2871     s4 = t[3+idt];
2872     while (nz--) {
2873       idx   = 4*(*vi++);
2874       x1  = t[idx];
2875       x2  = t[1+idx];
2876       x3  = t[2+idx];
2877       x4  = t[3+idx];
2878       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2879       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2880       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2881       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2882       v += 16;
2883     }
2884     idc      = 4*(*c--);
2885     v        = aa + 16*diag[i];
2886     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2887     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2888     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2889     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2890     x[idc]   = (PetscScalar)t[idt];
2891     x[1+idc] = (PetscScalar)t[1+idt];
2892     x[2+idc] = (PetscScalar)t[2+idt];
2893     x[3+idc] = (PetscScalar)t[3+idt];
2894  }
2895 
2896   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2897   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2898   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2899   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2900   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2901   PetscFunctionReturn(0);
2902 }
2903 
2904 #if defined (PETSC_HAVE_SSE)
2905 
2906 #include PETSC_HAVE_SSE
2907 
2908 #undef __FUNCT__
2909 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2910 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
2911 {
2912   /*
2913      Note: This code uses demotion of double
2914      to float when performing the mixed-mode computation.
2915      This may not be numerically reasonable for all applications.
2916   */
2917   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2918   IS             iscol=a->col,isrow=a->row;
2919   PetscErrorCode ierr;
2920   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
2921   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2922   MatScalar      *aa=a->a,*v;
2923   PetscScalar    *x,*b,*t;
2924 
2925   /* Make space in temp stack for 16 Byte Aligned arrays */
2926   float           ssealignedspace[11],*tmps,*tmpx;
2927   unsigned long   offset;
2928 
2929   PetscFunctionBegin;
2930   SSE_SCOPE_BEGIN;
2931 
2932     offset = (unsigned long)ssealignedspace % 16;
2933     if (offset) offset = (16 - offset)/4;
2934     tmps = &ssealignedspace[offset];
2935     tmpx = &ssealignedspace[offset+4];
2936     PREFETCH_NTA(aa+16*ai[1]);
2937 
2938     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2939     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2940     t  = a->solve_work;
2941 
2942     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2943     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2944 
2945     /* forward solve the lower triangular */
2946     idx  = 4*(*r++);
2947     t[0] = b[idx];   t[1] = b[1+idx];
2948     t[2] = b[2+idx]; t[3] = b[3+idx];
2949     v    =  aa + 16*ai[1];
2950 
2951     for (i=1; i<n;) {
2952       PREFETCH_NTA(&v[8]);
2953       vi   =  aj      + ai[i];
2954       nz   =  diag[i] - ai[i];
2955       idx  =  4*(*r++);
2956 
2957       /* Demote sum from double to float */
2958       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2959       LOAD_PS(tmps,XMM7);
2960 
2961       while (nz--) {
2962         PREFETCH_NTA(&v[16]);
2963         idx = 4*(*vi++);
2964 
2965         /* Demote solution (so far) from double to float */
2966         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2967 
2968         /* 4x4 Matrix-Vector product with negative accumulation: */
2969         SSE_INLINE_BEGIN_2(tmpx,v)
2970           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2971 
2972           /* First Column */
2973           SSE_COPY_PS(XMM0,XMM6)
2974           SSE_SHUFFLE(XMM0,XMM0,0x00)
2975           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2976           SSE_SUB_PS(XMM7,XMM0)
2977 
2978           /* Second Column */
2979           SSE_COPY_PS(XMM1,XMM6)
2980           SSE_SHUFFLE(XMM1,XMM1,0x55)
2981           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2982           SSE_SUB_PS(XMM7,XMM1)
2983 
2984           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2985 
2986           /* Third Column */
2987           SSE_COPY_PS(XMM2,XMM6)
2988           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2989           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2990           SSE_SUB_PS(XMM7,XMM2)
2991 
2992           /* Fourth Column */
2993           SSE_COPY_PS(XMM3,XMM6)
2994           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2995           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2996           SSE_SUB_PS(XMM7,XMM3)
2997         SSE_INLINE_END_2
2998 
2999         v  += 16;
3000       }
3001       idx = 4*i;
3002       v   = aa + 16*ai[++i];
3003       PREFETCH_NTA(v);
3004       STORE_PS(tmps,XMM7);
3005 
3006       /* Promote result from float to double */
3007       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3008     }
3009     /* backward solve the upper triangular */
3010     idt  = 4*(n-1);
3011     ai16 = 16*diag[n-1];
3012     v    = aa + ai16 + 16;
3013     for (i=n-1; i>=0;){
3014       PREFETCH_NTA(&v[8]);
3015       vi = aj + diag[i] + 1;
3016       nz = ai[i+1] - diag[i] - 1;
3017 
3018       /* Demote accumulator from double to float */
3019       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3020       LOAD_PS(tmps,XMM7);
3021 
3022       while (nz--) {
3023         PREFETCH_NTA(&v[16]);
3024         idx = 4*(*vi++);
3025 
3026         /* Demote solution (so far) from double to float */
3027         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3028 
3029         /* 4x4 Matrix-Vector Product with negative accumulation: */
3030         SSE_INLINE_BEGIN_2(tmpx,v)
3031           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3032 
3033           /* First Column */
3034           SSE_COPY_PS(XMM0,XMM6)
3035           SSE_SHUFFLE(XMM0,XMM0,0x00)
3036           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3037           SSE_SUB_PS(XMM7,XMM0)
3038 
3039           /* Second Column */
3040           SSE_COPY_PS(XMM1,XMM6)
3041           SSE_SHUFFLE(XMM1,XMM1,0x55)
3042           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3043           SSE_SUB_PS(XMM7,XMM1)
3044 
3045           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3046 
3047           /* Third Column */
3048           SSE_COPY_PS(XMM2,XMM6)
3049           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3050           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3051           SSE_SUB_PS(XMM7,XMM2)
3052 
3053           /* Fourth Column */
3054           SSE_COPY_PS(XMM3,XMM6)
3055           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3056           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3057           SSE_SUB_PS(XMM7,XMM3)
3058         SSE_INLINE_END_2
3059         v  += 16;
3060       }
3061       v    = aa + ai16;
3062       ai16 = 16*diag[--i];
3063       PREFETCH_NTA(aa+ai16+16);
3064       /*
3065          Scale the result by the diagonal 4x4 block,
3066          which was inverted as part of the factorization
3067       */
3068       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3069         /* First Column */
3070         SSE_COPY_PS(XMM0,XMM7)
3071         SSE_SHUFFLE(XMM0,XMM0,0x00)
3072         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3073 
3074         /* Second Column */
3075         SSE_COPY_PS(XMM1,XMM7)
3076         SSE_SHUFFLE(XMM1,XMM1,0x55)
3077         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3078         SSE_ADD_PS(XMM0,XMM1)
3079 
3080         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3081 
3082         /* Third Column */
3083         SSE_COPY_PS(XMM2,XMM7)
3084         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3085         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3086         SSE_ADD_PS(XMM0,XMM2)
3087 
3088         /* Fourth Column */
3089         SSE_COPY_PS(XMM3,XMM7)
3090         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3091         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3092         SSE_ADD_PS(XMM0,XMM3)
3093 
3094         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3095       SSE_INLINE_END_3
3096 
3097       /* Promote solution from float to double */
3098       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3099 
3100       /* Apply reordering to t and stream into x.    */
3101       /* This way, x doesn't pollute the cache.      */
3102       /* Be careful with size: 2 doubles = 4 floats! */
3103       idc  = 4*(*c--);
3104       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3105         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3106         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3107         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3108         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3109         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3110         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3111       SSE_INLINE_END_2
3112       v    = aa + ai16 + 16;
3113       idt -= 4;
3114     }
3115 
3116     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3117     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3118     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3119     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3120     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3121   SSE_SCOPE_END;
3122   PetscFunctionReturn(0);
3123 }
3124 
3125 #endif
3126 
3127 
3128 /*
3129       Special case where the matrix was ILU(0) factored in the natural
3130    ordering. This eliminates the need for the column and row permutation.
3131 */
3132 #undef __FUNCT__
3133 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3134 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3135 {
3136   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3137   PetscInt          n=a->mbs;
3138   const PetscInt    *ai=a->i,*aj=a->j;
3139   PetscErrorCode    ierr;
3140   const PetscInt    *diag = a->diag;
3141   const MatScalar   *aa=a->a;
3142   PetscScalar       *x;
3143   const PetscScalar *b;
3144 
3145   PetscFunctionBegin;
3146   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3147   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3148 
3149 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3150   {
3151     static PetscScalar w[2000]; /* very BAD need to fix */
3152     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3153   }
3154 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3155   {
3156     static PetscScalar w[2000]; /* very BAD need to fix */
3157     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3158   }
3159 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3160   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3161 #else
3162   {
3163     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3164     const MatScalar *v;
3165     PetscInt        jdx,idt,idx,nz,i,ai16;
3166     const PetscInt  *vi;
3167 
3168   /* forward solve the lower triangular */
3169   idx    = 0;
3170   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3171   for (i=1; i<n; i++) {
3172     v     =  aa      + 16*ai[i];
3173     vi    =  aj      + ai[i];
3174     nz    =  diag[i] - ai[i];
3175     idx   +=  4;
3176     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3177     while (nz--) {
3178       jdx   = 4*(*vi++);
3179       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3180       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3181       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3182       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3183       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3184       v    += 16;
3185     }
3186     x[idx]   = s1;
3187     x[1+idx] = s2;
3188     x[2+idx] = s3;
3189     x[3+idx] = s4;
3190   }
3191   /* backward solve the upper triangular */
3192   idt = 4*(n-1);
3193   for (i=n-1; i>=0; i--){
3194     ai16 = 16*diag[i];
3195     v    = aa + ai16 + 16;
3196     vi   = aj + diag[i] + 1;
3197     nz   = ai[i+1] - diag[i] - 1;
3198     s1 = x[idt];  s2 = x[1+idt];
3199     s3 = x[2+idt];s4 = x[3+idt];
3200     while (nz--) {
3201       idx   = 4*(*vi++);
3202       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3203       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3204       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3205       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3206       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3207       v    += 16;
3208     }
3209     v        = aa + ai16;
3210     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3211     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3212     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3213     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3214     idt -= 4;
3215   }
3216   }
3217 #endif
3218 
3219   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3220   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3221   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3222   PetscFunctionReturn(0);
3223 }
3224 
3225 #undef __FUNCT__
3226 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3227 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3228 {
3229     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3230     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3231     PetscErrorCode    ierr;
3232     PetscInt          idx,jdx,idt;
3233     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3234     const MatScalar   *aa=a->a,*v;
3235     PetscScalar       *x;
3236     const PetscScalar *b;
3237     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3238 
3239     PetscFunctionBegin;
3240     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3241     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3242     /* forward solve the lower triangular */
3243     idx    = 0;
3244     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3245     for (i=1; i<n; i++) {
3246        v    = aa + bs2*ai[i];
3247        vi   = aj + ai[i];
3248        nz   = ai[i+1] - ai[i];
3249       idx   = bs*i;
3250        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3251       for(k=0;k<nz;k++) {
3252           jdx   = bs*vi[k];
3253           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3254           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3255           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3256           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3257 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3258 
3259           v   +=  bs2;
3260         }
3261 
3262        x[idx]   = s1;
3263        x[1+idx] = s2;
3264        x[2+idx] = s3;
3265        x[3+idx] = s4;
3266     }
3267 
3268    /* backward solve the upper triangular */
3269   for (i=n-1; i>=0; i--){
3270      v   = aa + bs2*ai[2*n-i];
3271      vi  = aj + ai[2*n-i];
3272      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3273      idt = bs*i;
3274      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3275 
3276     for(k=0;k<nz;k++){
3277       idx   = bs*vi[k];
3278        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3279        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3280        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3281        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3282        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3283 
3284         v   +=  bs2;
3285     }
3286     /* x = inv_diagonal*x */
3287    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3288    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3289    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3290    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3291 
3292   }
3293 
3294   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3297   PetscFunctionReturn(0);
3298 }
3299 
3300 #undef __FUNCT__
3301 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
3302 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3303 {
3304     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3305     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3306     PetscErrorCode    ierr;
3307     PetscInt          idx,jdx,idt;
3308     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3309     const MatScalar   *aa=a->a,*v;
3310     PetscScalar       *x;
3311     const PetscScalar *b;
3312     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3313 
3314     PetscFunctionBegin;
3315     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3316     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3317     /* forward solve the lower triangular */
3318     idx    = 0;
3319     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3320     for (i=1; i<n; i++) {
3321        v    = aa + bs2*ai[i];
3322        vi   = aj + ai[i];
3323        nz   = ai[i+1] - ai[i];
3324       idx   = bs*i;
3325        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3326       for(k=0;k<nz;k++) {
3327           jdx   = bs*vi[k];
3328           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3329           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3330           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3331           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3332 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3333 
3334           v   +=  bs2;
3335         }
3336 
3337        x[idx]   = s1;
3338        x[1+idx] = s2;
3339        x[2+idx] = s3;
3340        x[3+idx] = s4;
3341     }
3342 
3343    /* backward solve the upper triangular */
3344   for (i=n-1; i>=0; i--){
3345     v   = aa + bs2*(adiag[i+1]+1);
3346      vi  = aj + adiag[i+1]+1;
3347      nz  = adiag[i] - adiag[i+1]-1;
3348      idt = bs*i;
3349      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3350 
3351     for(k=0;k<nz;k++){
3352       idx   = bs*vi[k];
3353        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3354        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3355        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3356        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3357        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3358 
3359         v   +=  bs2;
3360     }
3361     /* x = inv_diagonal*x */
3362    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3363    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3364    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3365    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3366 
3367   }
3368 
3369   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3370   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3371   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3372   PetscFunctionReturn(0);
3373 }
3374 
3375 #undef __FUNCT__
3376 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3377 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3378 {
3379   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3380   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3381   PetscErrorCode ierr;
3382   PetscInt       *diag = a->diag;
3383   MatScalar      *aa=a->a;
3384   PetscScalar    *x,*b;
3385 
3386   PetscFunctionBegin;
3387   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3388   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3389 
3390   {
3391     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3392     MatScalar  *v,*t=(MatScalar *)x;
3393     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3394 
3395     /* forward solve the lower triangular */
3396     idx  = 0;
3397     t[0] = (MatScalar)b[0];
3398     t[1] = (MatScalar)b[1];
3399     t[2] = (MatScalar)b[2];
3400     t[3] = (MatScalar)b[3];
3401     for (i=1; i<n; i++) {
3402       v     =  aa      + 16*ai[i];
3403       vi    =  aj      + ai[i];
3404       nz    =  diag[i] - ai[i];
3405       idx   +=  4;
3406       s1 = (MatScalar)b[idx];
3407       s2 = (MatScalar)b[1+idx];
3408       s3 = (MatScalar)b[2+idx];
3409       s4 = (MatScalar)b[3+idx];
3410       while (nz--) {
3411         jdx = 4*(*vi++);
3412         x1  = t[jdx];
3413         x2  = t[1+jdx];
3414         x3  = t[2+jdx];
3415         x4  = t[3+jdx];
3416         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3417         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3418         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3419         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3420         v    += 16;
3421       }
3422       t[idx]   = s1;
3423       t[1+idx] = s2;
3424       t[2+idx] = s3;
3425       t[3+idx] = s4;
3426     }
3427     /* backward solve the upper triangular */
3428     idt = 4*(n-1);
3429     for (i=n-1; i>=0; i--){
3430       ai16 = 16*diag[i];
3431       v    = aa + ai16 + 16;
3432       vi   = aj + diag[i] + 1;
3433       nz   = ai[i+1] - diag[i] - 1;
3434       s1   = t[idt];
3435       s2   = t[1+idt];
3436       s3   = t[2+idt];
3437       s4   = t[3+idt];
3438       while (nz--) {
3439         idx = 4*(*vi++);
3440         x1  = (MatScalar)x[idx];
3441         x2  = (MatScalar)x[1+idx];
3442         x3  = (MatScalar)x[2+idx];
3443         x4  = (MatScalar)x[3+idx];
3444         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3445         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3446         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3447         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3448         v    += 16;
3449       }
3450       v        = aa + ai16;
3451       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3452       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3453       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3454       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3455       idt -= 4;
3456     }
3457   }
3458 
3459   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3460   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3461   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3462   PetscFunctionReturn(0);
3463 }
3464 
3465 #if defined (PETSC_HAVE_SSE)
3466 
3467 #include PETSC_HAVE_SSE
3468 #undef __FUNCT__
3469 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3470 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
3471 {
3472   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3473   unsigned short *aj=(unsigned short *)a->j;
3474   PetscErrorCode ierr;
3475   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3476   MatScalar      *aa=a->a;
3477   PetscScalar    *x,*b;
3478 
3479   PetscFunctionBegin;
3480   SSE_SCOPE_BEGIN;
3481   /*
3482      Note: This code currently uses demotion of double
3483      to float when performing the mixed-mode computation.
3484      This may not be numerically reasonable for all applications.
3485   */
3486   PREFETCH_NTA(aa+16*ai[1]);
3487 
3488   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3489   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3490   {
3491     /* x will first be computed in single precision then promoted inplace to double */
3492     MatScalar      *v,*t=(MatScalar *)x;
3493     int            nz,i,idt,ai16;
3494     unsigned int   jdx,idx;
3495     unsigned short *vi;
3496     /* Forward solve the lower triangular factor. */
3497 
3498     /* First block is the identity. */
3499     idx  = 0;
3500     CONVERT_DOUBLE4_FLOAT4(t,b);
3501     v    =  aa + 16*((unsigned int)ai[1]);
3502 
3503     for (i=1; i<n;) {
3504       PREFETCH_NTA(&v[8]);
3505       vi   =  aj      + ai[i];
3506       nz   =  diag[i] - ai[i];
3507       idx +=  4;
3508 
3509       /* Demote RHS from double to float. */
3510       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3511       LOAD_PS(&t[idx],XMM7);
3512 
3513       while (nz--) {
3514         PREFETCH_NTA(&v[16]);
3515         jdx = 4*((unsigned int)(*vi++));
3516 
3517         /* 4x4 Matrix-Vector product with negative accumulation: */
3518         SSE_INLINE_BEGIN_2(&t[jdx],v)
3519           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3520 
3521           /* First Column */
3522           SSE_COPY_PS(XMM0,XMM6)
3523           SSE_SHUFFLE(XMM0,XMM0,0x00)
3524           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3525           SSE_SUB_PS(XMM7,XMM0)
3526 
3527           /* Second Column */
3528           SSE_COPY_PS(XMM1,XMM6)
3529           SSE_SHUFFLE(XMM1,XMM1,0x55)
3530           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3531           SSE_SUB_PS(XMM7,XMM1)
3532 
3533           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3534 
3535           /* Third Column */
3536           SSE_COPY_PS(XMM2,XMM6)
3537           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3538           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3539           SSE_SUB_PS(XMM7,XMM2)
3540 
3541           /* Fourth Column */
3542           SSE_COPY_PS(XMM3,XMM6)
3543           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3544           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3545           SSE_SUB_PS(XMM7,XMM3)
3546         SSE_INLINE_END_2
3547 
3548         v  += 16;
3549       }
3550       v    =  aa + 16*ai[++i];
3551       PREFETCH_NTA(v);
3552       STORE_PS(&t[idx],XMM7);
3553     }
3554 
3555     /* Backward solve the upper triangular factor.*/
3556 
3557     idt  = 4*(n-1);
3558     ai16 = 16*diag[n-1];
3559     v    = aa + ai16 + 16;
3560     for (i=n-1; i>=0;){
3561       PREFETCH_NTA(&v[8]);
3562       vi = aj + diag[i] + 1;
3563       nz = ai[i+1] - diag[i] - 1;
3564 
3565       LOAD_PS(&t[idt],XMM7);
3566 
3567       while (nz--) {
3568         PREFETCH_NTA(&v[16]);
3569         idx = 4*((unsigned int)(*vi++));
3570 
3571         /* 4x4 Matrix-Vector Product with negative accumulation: */
3572         SSE_INLINE_BEGIN_2(&t[idx],v)
3573           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3574 
3575           /* First Column */
3576           SSE_COPY_PS(XMM0,XMM6)
3577           SSE_SHUFFLE(XMM0,XMM0,0x00)
3578           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3579           SSE_SUB_PS(XMM7,XMM0)
3580 
3581           /* Second Column */
3582           SSE_COPY_PS(XMM1,XMM6)
3583           SSE_SHUFFLE(XMM1,XMM1,0x55)
3584           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3585           SSE_SUB_PS(XMM7,XMM1)
3586 
3587           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3588 
3589           /* Third Column */
3590           SSE_COPY_PS(XMM2,XMM6)
3591           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3592           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3593           SSE_SUB_PS(XMM7,XMM2)
3594 
3595           /* Fourth Column */
3596           SSE_COPY_PS(XMM3,XMM6)
3597           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3598           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3599           SSE_SUB_PS(XMM7,XMM3)
3600         SSE_INLINE_END_2
3601         v  += 16;
3602       }
3603       v    = aa + ai16;
3604       ai16 = 16*diag[--i];
3605       PREFETCH_NTA(aa+ai16+16);
3606       /*
3607          Scale the result by the diagonal 4x4 block,
3608          which was inverted as part of the factorization
3609       */
3610       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3611         /* First Column */
3612         SSE_COPY_PS(XMM0,XMM7)
3613         SSE_SHUFFLE(XMM0,XMM0,0x00)
3614         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3615 
3616         /* Second Column */
3617         SSE_COPY_PS(XMM1,XMM7)
3618         SSE_SHUFFLE(XMM1,XMM1,0x55)
3619         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3620         SSE_ADD_PS(XMM0,XMM1)
3621 
3622         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3623 
3624         /* Third Column */
3625         SSE_COPY_PS(XMM2,XMM7)
3626         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3627         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3628         SSE_ADD_PS(XMM0,XMM2)
3629 
3630         /* Fourth Column */
3631         SSE_COPY_PS(XMM3,XMM7)
3632         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3633         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3634         SSE_ADD_PS(XMM0,XMM3)
3635 
3636         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3637       SSE_INLINE_END_3
3638 
3639       v    = aa + ai16 + 16;
3640       idt -= 4;
3641     }
3642 
3643     /* Convert t from single precision back to double precision (inplace)*/
3644     idt = 4*(n-1);
3645     for (i=n-1;i>=0;i--) {
3646       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3647       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3648       PetscScalar *xtemp=&x[idt];
3649       MatScalar   *ttemp=&t[idt];
3650       xtemp[3] = (PetscScalar)ttemp[3];
3651       xtemp[2] = (PetscScalar)ttemp[2];
3652       xtemp[1] = (PetscScalar)ttemp[1];
3653       xtemp[0] = (PetscScalar)ttemp[0];
3654       idt -= 4;
3655     }
3656 
3657   } /* End of artificial scope. */
3658   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3659   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3660   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3661   SSE_SCOPE_END;
3662   PetscFunctionReturn(0);
3663 }
3664 
3665 #undef __FUNCT__
3666 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3667 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
3668 {
3669   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3670   int            *aj=a->j;
3671   PetscErrorCode ierr;
3672   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3673   MatScalar      *aa=a->a;
3674   PetscScalar    *x,*b;
3675 
3676   PetscFunctionBegin;
3677   SSE_SCOPE_BEGIN;
3678   /*
3679      Note: This code currently uses demotion of double
3680      to float when performing the mixed-mode computation.
3681      This may not be numerically reasonable for all applications.
3682   */
3683   PREFETCH_NTA(aa+16*ai[1]);
3684 
3685   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3686   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3687   {
3688     /* x will first be computed in single precision then promoted inplace to double */
3689     MatScalar *v,*t=(MatScalar *)x;
3690     int       nz,i,idt,ai16;
3691     int       jdx,idx;
3692     int       *vi;
3693     /* Forward solve the lower triangular factor. */
3694 
3695     /* First block is the identity. */
3696     idx  = 0;
3697     CONVERT_DOUBLE4_FLOAT4(t,b);
3698     v    =  aa + 16*ai[1];
3699 
3700     for (i=1; i<n;) {
3701       PREFETCH_NTA(&v[8]);
3702       vi   =  aj      + ai[i];
3703       nz   =  diag[i] - ai[i];
3704       idx +=  4;
3705 
3706       /* Demote RHS from double to float. */
3707       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3708       LOAD_PS(&t[idx],XMM7);
3709 
3710       while (nz--) {
3711         PREFETCH_NTA(&v[16]);
3712         jdx = 4*(*vi++);
3713 /*          jdx = *vi++; */
3714 
3715         /* 4x4 Matrix-Vector product with negative accumulation: */
3716         SSE_INLINE_BEGIN_2(&t[jdx],v)
3717           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3718 
3719           /* First Column */
3720           SSE_COPY_PS(XMM0,XMM6)
3721           SSE_SHUFFLE(XMM0,XMM0,0x00)
3722           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3723           SSE_SUB_PS(XMM7,XMM0)
3724 
3725           /* Second Column */
3726           SSE_COPY_PS(XMM1,XMM6)
3727           SSE_SHUFFLE(XMM1,XMM1,0x55)
3728           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3729           SSE_SUB_PS(XMM7,XMM1)
3730 
3731           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3732 
3733           /* Third Column */
3734           SSE_COPY_PS(XMM2,XMM6)
3735           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3736           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3737           SSE_SUB_PS(XMM7,XMM2)
3738 
3739           /* Fourth Column */
3740           SSE_COPY_PS(XMM3,XMM6)
3741           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3742           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3743           SSE_SUB_PS(XMM7,XMM3)
3744         SSE_INLINE_END_2
3745 
3746         v  += 16;
3747       }
3748       v    =  aa + 16*ai[++i];
3749       PREFETCH_NTA(v);
3750       STORE_PS(&t[idx],XMM7);
3751     }
3752 
3753     /* Backward solve the upper triangular factor.*/
3754 
3755     idt  = 4*(n-1);
3756     ai16 = 16*diag[n-1];
3757     v    = aa + ai16 + 16;
3758     for (i=n-1; i>=0;){
3759       PREFETCH_NTA(&v[8]);
3760       vi = aj + diag[i] + 1;
3761       nz = ai[i+1] - diag[i] - 1;
3762 
3763       LOAD_PS(&t[idt],XMM7);
3764 
3765       while (nz--) {
3766         PREFETCH_NTA(&v[16]);
3767         idx = 4*(*vi++);
3768 /*          idx = *vi++; */
3769 
3770         /* 4x4 Matrix-Vector Product with negative accumulation: */
3771         SSE_INLINE_BEGIN_2(&t[idx],v)
3772           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3773 
3774           /* First Column */
3775           SSE_COPY_PS(XMM0,XMM6)
3776           SSE_SHUFFLE(XMM0,XMM0,0x00)
3777           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3778           SSE_SUB_PS(XMM7,XMM0)
3779 
3780           /* Second Column */
3781           SSE_COPY_PS(XMM1,XMM6)
3782           SSE_SHUFFLE(XMM1,XMM1,0x55)
3783           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3784           SSE_SUB_PS(XMM7,XMM1)
3785 
3786           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3787 
3788           /* Third Column */
3789           SSE_COPY_PS(XMM2,XMM6)
3790           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3791           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3792           SSE_SUB_PS(XMM7,XMM2)
3793 
3794           /* Fourth Column */
3795           SSE_COPY_PS(XMM3,XMM6)
3796           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3797           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3798           SSE_SUB_PS(XMM7,XMM3)
3799         SSE_INLINE_END_2
3800         v  += 16;
3801       }
3802       v    = aa + ai16;
3803       ai16 = 16*diag[--i];
3804       PREFETCH_NTA(aa+ai16+16);
3805       /*
3806          Scale the result by the diagonal 4x4 block,
3807          which was inverted as part of the factorization
3808       */
3809       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3810         /* First Column */
3811         SSE_COPY_PS(XMM0,XMM7)
3812         SSE_SHUFFLE(XMM0,XMM0,0x00)
3813         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3814 
3815         /* Second Column */
3816         SSE_COPY_PS(XMM1,XMM7)
3817         SSE_SHUFFLE(XMM1,XMM1,0x55)
3818         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3819         SSE_ADD_PS(XMM0,XMM1)
3820 
3821         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3822 
3823         /* Third Column */
3824         SSE_COPY_PS(XMM2,XMM7)
3825         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3826         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3827         SSE_ADD_PS(XMM0,XMM2)
3828 
3829         /* Fourth Column */
3830         SSE_COPY_PS(XMM3,XMM7)
3831         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3832         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3833         SSE_ADD_PS(XMM0,XMM3)
3834 
3835         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3836       SSE_INLINE_END_3
3837 
3838       v    = aa + ai16 + 16;
3839       idt -= 4;
3840     }
3841 
3842     /* Convert t from single precision back to double precision (inplace)*/
3843     idt = 4*(n-1);
3844     for (i=n-1;i>=0;i--) {
3845       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3846       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3847       PetscScalar *xtemp=&x[idt];
3848       MatScalar   *ttemp=&t[idt];
3849       xtemp[3] = (PetscScalar)ttemp[3];
3850       xtemp[2] = (PetscScalar)ttemp[2];
3851       xtemp[1] = (PetscScalar)ttemp[1];
3852       xtemp[0] = (PetscScalar)ttemp[0];
3853       idt -= 4;
3854     }
3855 
3856   } /* End of artificial scope. */
3857   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3858   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3859   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3860   SSE_SCOPE_END;
3861   PetscFunctionReturn(0);
3862 }
3863 
3864 #endif
3865 
3866 #undef __FUNCT__
3867 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3868 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
3869 {
3870   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3871   IS                iscol=a->col,isrow=a->row;
3872   PetscErrorCode    ierr;
3873   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3874   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3875   const MatScalar   *aa=a->a,*v;
3876   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3877   const PetscScalar *b;
3878 
3879   PetscFunctionBegin;
3880   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3881   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3882   t  = a->solve_work;
3883 
3884   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3885   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3886 
3887   /* forward solve the lower triangular */
3888   idx    = 3*(*r++);
3889   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3890   for (i=1; i<n; i++) {
3891     v     = aa + 9*ai[i];
3892     vi    = aj + ai[i];
3893     nz    = diag[i] - ai[i];
3894     idx   = 3*(*r++);
3895     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3896     while (nz--) {
3897       idx   = 3*(*vi++);
3898       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3899       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3900       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3901       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3902       v += 9;
3903     }
3904     idx = 3*i;
3905     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3906   }
3907   /* backward solve the upper triangular */
3908   for (i=n-1; i>=0; i--){
3909     v    = aa + 9*diag[i] + 9;
3910     vi   = aj + diag[i] + 1;
3911     nz   = ai[i+1] - diag[i] - 1;
3912     idt  = 3*i;
3913     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3914     while (nz--) {
3915       idx   = 3*(*vi++);
3916       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3917       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3918       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3919       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3920       v += 9;
3921     }
3922     idc = 3*(*c--);
3923     v   = aa + 9*diag[i];
3924     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3925     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3926     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3927   }
3928   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3929   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3930   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3931   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3932   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3933   PetscFunctionReturn(0);
3934 }
3935 
3936 #undef __FUNCT__
3937 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
3938 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
3939 {
3940   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3941   IS                iscol=a->col,isrow=a->row;
3942   PetscErrorCode    ierr;
3943   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
3944   const PetscInt    *r,*c,*rout,*cout;
3945   const MatScalar   *aa=a->a,*v;
3946   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3947   const PetscScalar *b;
3948 
3949   PetscFunctionBegin;
3950   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3951   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3952   t  = a->solve_work;
3953 
3954   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3955   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3956 
3957   /* forward solve the lower triangular */
3958   idx    = 3*r[0];
3959   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3960   for (i=1; i<n; i++) {
3961     v     = aa + 9*ai[i];
3962     vi    = aj + ai[i];
3963     nz    = ai[i+1] - ai[i];
3964     idx   = 3*r[i];
3965     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3966     for(m=0;m<nz;m++){
3967       idx   = 3*vi[m];
3968       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3969       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3970       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3971       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3972       v += 9;
3973     }
3974     idx = 3*i;
3975     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3976   }
3977   /* backward solve the upper triangular */
3978   for (i=n-1; i>=0; i--){
3979     k    = 2*n-i;
3980     v    = aa + 9*ai[k];
3981     vi   = aj + ai[k];
3982     nz   = ai[k +1] - ai[k] - 1;
3983     idt  = 3*i;
3984     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3985     for(m=0;m<nz;m++){
3986       idx   = 3*vi[m];
3987       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3988       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3989       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3990       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3991       v += 9;
3992     }
3993     idc = 3*c[i];
3994     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3995     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3996     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3997   }
3998   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3999   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4000   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4001   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4002   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4003   PetscFunctionReturn(0);
4004 }
4005 
4006 #undef __FUNCT__
4007 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2"
4008 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4009 {
4010   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4011   IS                iscol=a->col,isrow=a->row;
4012   PetscErrorCode    ierr;
4013   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4014   const PetscInt    *r,*c,*rout,*cout;
4015   const MatScalar   *aa=a->a,*v;
4016   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4017   const PetscScalar *b;
4018 
4019   PetscFunctionBegin;
4020   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4021   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4022   t  = a->solve_work;
4023 
4024   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4025   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4026 
4027   /* forward solve the lower triangular */
4028   idx    = 3*r[0];
4029   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4030   for (i=1; i<n; i++) {
4031     v     = aa + 9*ai[i];
4032     vi    = aj + ai[i];
4033     nz    = ai[i+1] - ai[i];
4034     idx   = 3*r[i];
4035     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4036     for(m=0;m<nz;m++){
4037       idx   = 3*vi[m];
4038       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4039       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4040       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4041       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4042       v += 9;
4043     }
4044     idx = 3*i;
4045     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4046   }
4047   /* backward solve the upper triangular */
4048   for (i=n-1; i>=0; i--){
4049     v    = aa + 9*(adiag[i+1]+1);
4050     vi   = aj + adiag[i+1]+1;
4051     nz   = adiag[i] - adiag[i+1] - 1;
4052     idt  = 3*i;
4053     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4054     for(m=0;m<nz;m++){
4055       idx   = 3*vi[m];
4056       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4057       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4058       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4059       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4060       v += 9;
4061     }
4062     idc = 3*c[i];
4063     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4064     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4065     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4066   }
4067   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4068   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4069   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4070   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4071   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4072   PetscFunctionReturn(0);
4073 }
4074 
4075 /*
4076       Special case where the matrix was ILU(0) factored in the natural
4077    ordering. This eliminates the need for the column and row permutation.
4078 */
4079 #undef __FUNCT__
4080 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4081 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4082 {
4083   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4084   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4085   PetscErrorCode    ierr;
4086   PetscInt          *diag = a->diag;
4087   const MatScalar   *aa=a->a,*v;
4088   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4089   const PetscScalar *b;
4090   PetscInt          jdx,idt,idx,nz,*vi,i;
4091 
4092   PetscFunctionBegin;
4093   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4094   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4095 
4096   /* forward solve the lower triangular */
4097   idx    = 0;
4098   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4099   for (i=1; i<n; i++) {
4100     v     =  aa      + 9*ai[i];
4101     vi    =  aj      + ai[i];
4102     nz    =  diag[i] - ai[i];
4103     idx   +=  3;
4104     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4105     while (nz--) {
4106       jdx   = 3*(*vi++);
4107       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4108       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4109       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4110       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4111       v    += 9;
4112     }
4113     x[idx]   = s1;
4114     x[1+idx] = s2;
4115     x[2+idx] = s3;
4116   }
4117   /* backward solve the upper triangular */
4118   for (i=n-1; i>=0; i--){
4119     v    = aa + 9*diag[i] + 9;
4120     vi   = aj + diag[i] + 1;
4121     nz   = ai[i+1] - diag[i] - 1;
4122     idt  = 3*i;
4123     s1 = x[idt];  s2 = x[1+idt];
4124     s3 = x[2+idt];
4125     while (nz--) {
4126       idx   = 3*(*vi++);
4127       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4128       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4129       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4130       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4131       v    += 9;
4132     }
4133     v        = aa +  9*diag[i];
4134     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4135     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4136     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4137   }
4138 
4139   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4140   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4141   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4142   PetscFunctionReturn(0);
4143 }
4144 
4145 #undef __FUNCT__
4146 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4147 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4148 {
4149     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4150     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4151     PetscErrorCode    ierr;
4152     PetscInt          idx,jdx,idt;
4153     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4154     const MatScalar   *aa=a->a,*v;
4155     PetscScalar       *x;
4156     const PetscScalar *b;
4157     PetscScalar        s1,s2,s3,x1,x2,x3;
4158 
4159     PetscFunctionBegin;
4160     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4161     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4162     /* forward solve the lower triangular */
4163     idx    = 0;
4164     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4165     for (i=1; i<n; i++) {
4166        v    = aa + bs2*ai[i];
4167        vi   = aj + ai[i];
4168        nz   = ai[i+1] - ai[i];
4169       idx   = bs*i;
4170        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4171       for(k=0;k<nz;k++){
4172          jdx   = bs*vi[k];
4173           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4174           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4175           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4176           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4177 
4178           v   +=  bs2;
4179         }
4180 
4181        x[idx]   = s1;
4182        x[1+idx] = s2;
4183        x[2+idx] = s3;
4184     }
4185 
4186    /* backward solve the upper triangular */
4187   for (i=n-1; i>=0; i--){
4188      v   = aa + bs2*ai[2*n-i];
4189      vi  = aj + ai[2*n-i];
4190      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4191      idt = bs*i;
4192      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4193 
4194      for(k=0;k<nz;k++){
4195        idx   = bs*vi[k];
4196        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4197        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4198        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4199        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4200 
4201         v   +=  bs2;
4202     }
4203     /* x = inv_diagonal*x */
4204    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4205    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4206    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4207 
4208   }
4209 
4210   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4211   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4212   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4213   PetscFunctionReturn(0);
4214 }
4215 
4216 #undef __FUNCT__
4217 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
4218 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4219 {
4220     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4221     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4222     PetscErrorCode    ierr;
4223     PetscInt          idx,jdx,idt;
4224     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4225     const MatScalar   *aa=a->a,*v;
4226     PetscScalar       *x;
4227     const PetscScalar *b;
4228     PetscScalar        s1,s2,s3,x1,x2,x3;
4229 
4230     PetscFunctionBegin;
4231     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4232     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4233     /* forward solve the lower triangular */
4234     idx    = 0;
4235     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4236     for (i=1; i<n; i++) {
4237        v    = aa + bs2*ai[i];
4238        vi   = aj + ai[i];
4239        nz   = ai[i+1] - ai[i];
4240       idx   = bs*i;
4241        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4242       for(k=0;k<nz;k++){
4243          jdx   = bs*vi[k];
4244           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4245           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4246           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4247           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4248 
4249           v   +=  bs2;
4250         }
4251 
4252        x[idx]   = s1;
4253        x[1+idx] = s2;
4254        x[2+idx] = s3;
4255     }
4256 
4257    /* backward solve the upper triangular */
4258   for (i=n-1; i>=0; i--){
4259     v   = aa + bs2*(adiag[i+1]+1);
4260      vi  = aj + adiag[i+1]+1;
4261      nz  = adiag[i] - adiag[i+1]-1;
4262      idt = bs*i;
4263      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4264 
4265      for(k=0;k<nz;k++){
4266        idx   = bs*vi[k];
4267        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4268        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4269        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4270        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4271 
4272         v   +=  bs2;
4273     }
4274     /* x = inv_diagonal*x */
4275    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4276    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4277    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4278 
4279   }
4280 
4281   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4282   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4283   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4284   PetscFunctionReturn(0);
4285 }
4286 
4287 #undef __FUNCT__
4288 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4289 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4290 {
4291   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4292   IS                iscol=a->col,isrow=a->row;
4293   PetscErrorCode    ierr;
4294   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4295   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4296   const MatScalar   *aa=a->a,*v;
4297   PetscScalar       *x,s1,s2,x1,x2,*t;
4298   const PetscScalar *b;
4299 
4300   PetscFunctionBegin;
4301   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4302   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4303   t  = a->solve_work;
4304 
4305   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4306   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4307 
4308   /* forward solve the lower triangular */
4309   idx    = 2*(*r++);
4310   t[0] = b[idx]; t[1] = b[1+idx];
4311   for (i=1; i<n; i++) {
4312     v     = aa + 4*ai[i];
4313     vi    = aj + ai[i];
4314     nz    = diag[i] - ai[i];
4315     idx   = 2*(*r++);
4316     s1  = b[idx]; s2 = b[1+idx];
4317     while (nz--) {
4318       idx   = 2*(*vi++);
4319       x1    = t[idx]; x2 = t[1+idx];
4320       s1 -= v[0]*x1 + v[2]*x2;
4321       s2 -= v[1]*x1 + v[3]*x2;
4322       v += 4;
4323     }
4324     idx = 2*i;
4325     t[idx] = s1; t[1+idx] = s2;
4326   }
4327   /* backward solve the upper triangular */
4328   for (i=n-1; i>=0; i--){
4329     v    = aa + 4*diag[i] + 4;
4330     vi   = aj + diag[i] + 1;
4331     nz   = ai[i+1] - diag[i] - 1;
4332     idt  = 2*i;
4333     s1 = t[idt]; s2 = t[1+idt];
4334     while (nz--) {
4335       idx   = 2*(*vi++);
4336       x1    = t[idx]; x2 = t[1+idx];
4337       s1 -= v[0]*x1 + v[2]*x2;
4338       s2 -= v[1]*x1 + v[3]*x2;
4339       v += 4;
4340     }
4341     idc = 2*(*c--);
4342     v   = aa + 4*diag[i];
4343     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4344     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4345   }
4346   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4347   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4348   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4349   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4350   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4351   PetscFunctionReturn(0);
4352 }
4353 
4354 #undef __FUNCT__
4355 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4356 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
4357 {
4358   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4359   IS                iscol=a->col,isrow=a->row;
4360   PetscErrorCode    ierr;
4361   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
4362   const PetscInt    *r,*c,*rout,*cout;
4363   const MatScalar   *aa=a->a,*v;
4364   PetscScalar       *x,s1,s2,x1,x2,*t;
4365   const PetscScalar *b;
4366 
4367   PetscFunctionBegin;
4368   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4369   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4370   t  = a->solve_work;
4371 
4372   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4373   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4374 
4375   /* forward solve the lower triangular */
4376   idx    = 2*r[0];
4377   t[0] = b[idx]; t[1] = b[1+idx];
4378   for (i=1; i<n; i++) {
4379     v     = aa + 4*ai[i];
4380     vi    = aj + ai[i];
4381     nz    = ai[i+1] - ai[i];
4382     idx   = 2*r[i];
4383     s1  = b[idx]; s2 = b[1+idx];
4384     for(m=0;m<nz;m++){
4385       jdx   = 2*vi[m];
4386       x1    = t[jdx]; x2 = t[1+jdx];
4387       s1 -= v[0]*x1 + v[2]*x2;
4388       s2 -= v[1]*x1 + v[3]*x2;
4389       v += 4;
4390     }
4391     idx = 2*i;
4392     t[idx] = s1; t[1+idx] = s2;
4393   }
4394   /* backward solve the upper triangular */
4395   for (i=n-1; i>=0; i--){
4396     k = 2*n-i;
4397     v    = aa + 4*ai[k];
4398     vi   = aj + ai[k];
4399     nz   = ai[k +1] - ai[k] - 1;
4400     idt  = 2*i;
4401     s1 = t[idt]; s2 = t[1+idt];
4402     for(m=0;m<nz;m++){
4403       idx   = 2*vi[m];
4404       x1    = t[idx]; x2 = t[1+idx];
4405       s1 -= v[0]*x1 + v[2]*x2;
4406       s2 -= v[1]*x1 + v[3]*x2;
4407       v += 4;
4408     }
4409     idc = 2*c[i];
4410     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4411     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4412   }
4413   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4414   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4415   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4416   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4417   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4418   PetscFunctionReturn(0);
4419 }
4420 
4421 #undef __FUNCT__
4422 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
4423 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4424 {
4425   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4426   IS                iscol=a->col,isrow=a->row;
4427   PetscErrorCode    ierr;
4428   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4429   const PetscInt    *r,*c,*rout,*cout;
4430   const MatScalar   *aa=a->a,*v;
4431   PetscScalar       *x,s1,s2,x1,x2,*t;
4432   const PetscScalar *b;
4433 
4434   PetscFunctionBegin;
4435   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4436   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4437   t  = a->solve_work;
4438 
4439   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4440   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4441 
4442   /* forward solve the lower triangular */
4443   idx    = 2*r[0];
4444   t[0] = b[idx]; t[1] = b[1+idx];
4445   for (i=1; i<n; i++) {
4446     v     = aa + 4*ai[i];
4447     vi    = aj + ai[i];
4448     nz    = ai[i+1] - ai[i];
4449     idx   = 2*r[i];
4450     s1  = b[idx]; s2 = b[1+idx];
4451     for(m=0;m<nz;m++){
4452       jdx   = 2*vi[m];
4453       x1    = t[jdx]; x2 = t[1+jdx];
4454       s1 -= v[0]*x1 + v[2]*x2;
4455       s2 -= v[1]*x1 + v[3]*x2;
4456       v += 4;
4457     }
4458     idx = 2*i;
4459     t[idx] = s1; t[1+idx] = s2;
4460   }
4461   /* backward solve the upper triangular */
4462   for (i=n-1; i>=0; i--){
4463     v    = aa + 4*(adiag[i+1]+1);
4464     vi   = aj + adiag[i+1]+1;
4465     nz   = adiag[i] - adiag[i+1] - 1;
4466     idt  = 2*i;
4467     s1 = t[idt]; s2 = t[1+idt];
4468     for(m=0;m<nz;m++){
4469       idx   = 2*vi[m];
4470       x1    = t[idx]; x2 = t[1+idx];
4471       s1 -= v[0]*x1 + v[2]*x2;
4472       s2 -= v[1]*x1 + v[3]*x2;
4473       v += 4;
4474     }
4475     idc = 2*c[i];
4476     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4477     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4478   }
4479   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4480   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4481   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4482   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4483   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4484   PetscFunctionReturn(0);
4485 }
4486 
4487 /*
4488       Special case where the matrix was ILU(0) factored in the natural
4489    ordering. This eliminates the need for the column and row permutation.
4490 */
4491 #undef __FUNCT__
4492 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4493 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
4494 {
4495   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4496   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4497   PetscErrorCode    ierr;
4498   PetscInt          *diag = a->diag;
4499   const MatScalar   *aa=a->a,*v;
4500   PetscScalar       *x,s1,s2,x1,x2;
4501   const PetscScalar *b;
4502   PetscInt          jdx,idt,idx,nz,*vi,i;
4503 
4504   PetscFunctionBegin;
4505   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4506   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4507 
4508   /* forward solve the lower triangular */
4509   idx    = 0;
4510   x[0]   = b[0]; x[1] = b[1];
4511   for (i=1; i<n; i++) {
4512     v     =  aa      + 4*ai[i];
4513     vi    =  aj      + ai[i];
4514     nz    =  diag[i] - ai[i];
4515     idx   +=  2;
4516     s1  =  b[idx];s2 = b[1+idx];
4517     while (nz--) {
4518       jdx   = 2*(*vi++);
4519       x1    = x[jdx];x2 = x[1+jdx];
4520       s1 -= v[0]*x1 + v[2]*x2;
4521       s2 -= v[1]*x1 + v[3]*x2;
4522       v    += 4;
4523     }
4524     x[idx]   = s1;
4525     x[1+idx] = s2;
4526   }
4527   /* backward solve the upper triangular */
4528   for (i=n-1; i>=0; i--){
4529     v    = aa + 4*diag[i] + 4;
4530     vi   = aj + diag[i] + 1;
4531     nz   = ai[i+1] - diag[i] - 1;
4532     idt  = 2*i;
4533     s1 = x[idt];  s2 = x[1+idt];
4534     while (nz--) {
4535       idx   = 2*(*vi++);
4536       x1    = x[idx];   x2 = x[1+idx];
4537       s1 -= v[0]*x1 + v[2]*x2;
4538       s2 -= v[1]*x1 + v[3]*x2;
4539       v    += 4;
4540     }
4541     v        = aa +  4*diag[i];
4542     x[idt]   = v[0]*s1 + v[2]*s2;
4543     x[1+idt] = v[1]*s1 + v[3]*s2;
4544   }
4545 
4546   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4547   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4548   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4549   PetscFunctionReturn(0);
4550 }
4551 
4552 #undef __FUNCT__
4553 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4554 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4555 {
4556     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4557     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4558     PetscErrorCode    ierr;
4559     PetscInt          jdx;
4560     const MatScalar   *aa=a->a,*v;
4561     PetscScalar       *x,s1,s2,x1,x2;
4562     const PetscScalar *b;
4563 
4564     PetscFunctionBegin;
4565     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4566     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4567     /* forward solve the lower triangular */
4568     idx    = 0;
4569     x[0] = b[idx]; x[1] = b[1+idx];
4570     for (i=1; i<n; i++) {
4571         v   = aa + 4*ai[i];
4572        vi   = aj + ai[i];
4573        nz   = ai[i+1] - ai[i];
4574        idx  = 2*i;
4575        s1   = b[idx];s2 = b[1+idx];
4576       for(k=0;k<nz;k++){
4577          jdx   = 2*vi[k];
4578           x1    = x[jdx];x2 = x[1+jdx];
4579           s1   -= v[0]*x1 + v[2]*x2;
4580           s2   -= v[1]*x1 + v[3]*x2;
4581            v   +=  4;
4582         }
4583        x[idx]   = s1;
4584        x[1+idx] = s2;
4585     }
4586 
4587    /* backward solve the upper triangular */
4588   for (i=n-1; i>=0; i--){
4589      v   = aa + 4*ai[2*n-i];
4590      vi  = aj + ai[2*n-i];
4591      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4592      idt = 2*i;
4593      s1 = x[idt];  s2 = x[1+idt];
4594      for(k=0;k<nz;k++){
4595       idx   = 2*vi[k];
4596        x1    = x[idx];   x2 = x[1+idx];
4597        s1 -= v[0]*x1 + v[2]*x2;
4598        s2 -= v[1]*x1 + v[3]*x2;
4599          v    += 4;
4600     }
4601     /* x = inv_diagonal*x */
4602    x[idt]   = v[0]*s1 + v[2]*s2;
4603    x[1+idt] = v[1]*s1 + v[3]*s2;
4604   }
4605 
4606   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4608   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4609   PetscFunctionReturn(0);
4610 }
4611 
4612 #undef __FUNCT__
4613 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4614 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4615 {
4616     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4617     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4618     PetscErrorCode    ierr;
4619     PetscInt          jdx;
4620     const MatScalar   *aa=a->a,*v;
4621     PetscScalar       *x,s1,s2,x1,x2;
4622     const PetscScalar *b;
4623 
4624     PetscFunctionBegin;
4625     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4626     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4627     /* forward solve the lower triangular */
4628     idx    = 0;
4629     x[0] = b[idx]; x[1] = b[1+idx];
4630     for (i=1; i<n; i++) {
4631         v   = aa + 4*ai[i];
4632        vi   = aj + ai[i];
4633        nz   = ai[i+1] - ai[i];
4634        idx  = 2*i;
4635        s1   = b[idx];s2 = b[1+idx];
4636       for(k=0;k<nz;k++){
4637          jdx   = 2*vi[k];
4638           x1    = x[jdx];x2 = x[1+jdx];
4639           s1   -= v[0]*x1 + v[2]*x2;
4640           s2   -= v[1]*x1 + v[3]*x2;
4641            v   +=  4;
4642         }
4643        x[idx]   = s1;
4644        x[1+idx] = s2;
4645     }
4646 
4647    /* backward solve the upper triangular */
4648   for (i=n-1; i>=0; i--){
4649      v   = aa + 4*(adiag[i+1]+1);
4650      vi  = aj + adiag[i+1]+1;
4651      nz  = adiag[i] - adiag[i+1]-1;
4652      idt = 2*i;
4653      s1 = x[idt];  s2 = x[1+idt];
4654      for(k=0;k<nz;k++){
4655       idx   = 2*vi[k];
4656        x1    = x[idx];   x2 = x[1+idx];
4657        s1 -= v[0]*x1 + v[2]*x2;
4658        s2 -= v[1]*x1 + v[3]*x2;
4659          v    += 4;
4660     }
4661     /* x = inv_diagonal*x */
4662    x[idt]   = v[0]*s1 + v[2]*s2;
4663    x[1+idt] = v[1]*s1 + v[3]*s2;
4664   }
4665 
4666   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4667   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4668   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4669   PetscFunctionReturn(0);
4670 }
4671 
4672 #undef __FUNCT__
4673 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4674 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4675 {
4676   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4677   IS             iscol=a->col,isrow=a->row;
4678   PetscErrorCode ierr;
4679   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4680   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4681   MatScalar      *aa=a->a,*v;
4682   PetscScalar    *x,*b,s1,*t;
4683 
4684   PetscFunctionBegin;
4685   if (!n) PetscFunctionReturn(0);
4686 
4687   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4688   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4689   t  = a->solve_work;
4690 
4691   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4692   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4693 
4694   /* forward solve the lower triangular */
4695   t[0] = b[*r++];
4696   for (i=1; i<n; i++) {
4697     v     = aa + ai[i];
4698     vi    = aj + ai[i];
4699     nz    = diag[i] - ai[i];
4700     s1  = b[*r++];
4701     while (nz--) {
4702       s1 -= (*v++)*t[*vi++];
4703     }
4704     t[i] = s1;
4705   }
4706   /* backward solve the upper triangular */
4707   for (i=n-1; i>=0; i--){
4708     v    = aa + diag[i] + 1;
4709     vi   = aj + diag[i] + 1;
4710     nz   = ai[i+1] - diag[i] - 1;
4711     s1 = t[i];
4712     while (nz--) {
4713       s1 -= (*v++)*t[*vi++];
4714     }
4715     x[*c--] = t[i] = aa[diag[i]]*s1;
4716   }
4717 
4718   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4719   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4720   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4721   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4722   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4723   PetscFunctionReturn(0);
4724 }
4725 /*
4726       Special case where the matrix was ILU(0) factored in the natural
4727    ordering. This eliminates the need for the column and row permutation.
4728 */
4729 #undef __FUNCT__
4730 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4731 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
4732 {
4733   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4734   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4735   PetscErrorCode ierr;
4736   PetscInt       *diag = a->diag;
4737   MatScalar      *aa=a->a;
4738   PetscScalar    *x,*b;
4739   PetscScalar    s1,x1;
4740   MatScalar      *v;
4741   PetscInt       jdx,idt,idx,nz,*vi,i;
4742 
4743   PetscFunctionBegin;
4744   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4745   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4746 
4747   /* forward solve the lower triangular */
4748   idx    = 0;
4749   x[0]   = b[0];
4750   for (i=1; i<n; i++) {
4751     v     =  aa      + ai[i];
4752     vi    =  aj      + ai[i];
4753     nz    =  diag[i] - ai[i];
4754     idx   +=  1;
4755     s1  =  b[idx];
4756     while (nz--) {
4757       jdx   = *vi++;
4758       x1    = x[jdx];
4759       s1 -= v[0]*x1;
4760       v    += 1;
4761     }
4762     x[idx]   = s1;
4763   }
4764   /* backward solve the upper triangular */
4765   for (i=n-1; i>=0; i--){
4766     v    = aa + diag[i] + 1;
4767     vi   = aj + diag[i] + 1;
4768     nz   = ai[i+1] - diag[i] - 1;
4769     idt  = i;
4770     s1 = x[idt];
4771     while (nz--) {
4772       idx   = *vi++;
4773       x1    = x[idx];
4774       s1 -= v[0]*x1;
4775       v    += 1;
4776     }
4777     v        = aa +  diag[i];
4778     x[idt]   = v[0]*s1;
4779   }
4780   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4781   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4782   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4783   PetscFunctionReturn(0);
4784 }
4785 
4786 /* ----------------------------------------------------------------*/
4787 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
4788 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
4789 
4790 #undef __FUNCT__
4791 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
4792 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
4793 {
4794   Mat            C=B;
4795   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
4796   IS             isrow = b->row,isicol = b->icol;
4797   PetscErrorCode ierr;
4798   const PetscInt *r,*ic,*ics;
4799   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
4800   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4801   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4802   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4803   MatScalar      *v_work;
4804 
4805   PetscFunctionBegin;
4806   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4807   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4808   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4809   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
4810   ics  = ic;
4811 
4812   /* generate work space needed by dense LU factorization */
4813   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
4814   mwork    = v_work + bs;
4815   v_pivots = (PetscInt*)(mwork + bs2);
4816 
4817   for (i=0; i<n; i++){
4818     /* zero rtmp */
4819     /* L part */
4820     nz    = bi[i+1] - bi[i];
4821     bjtmp = bj + bi[i];
4822     for  (j=0; j<nz; j++){
4823       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4824     }
4825 
4826     /* U part */
4827     nz = bi[2*n-i+1] - bi[2*n-i];
4828     bjtmp = bj + bi[2*n-i];
4829     for  (j=0; j<nz; j++){
4830       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4831     }
4832 
4833     /* load in initial (unfactored row) */
4834     nz    = ai[r[i]+1] - ai[r[i]];
4835     ajtmp = aj + ai[r[i]];
4836     v     = aa + bs2*ai[r[i]];
4837     for (j=0; j<nz; j++) {
4838       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
4839     }
4840 
4841     /* elimination */
4842     bjtmp = bj + bi[i];
4843     nzL   = bi[i+1] - bi[i];
4844     for(k=0;k < nzL;k++) {
4845       row = bjtmp[k];
4846       pc = rtmp + bs2*row;
4847       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4848       if (flg) {
4849         pv         = b->a + bs2*bdiag[row];
4850         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
4851         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
4852         pv         = b->a + bs2*bi[2*n-row];
4853         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
4854         for (j=0; j<nz; j++) {
4855           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4856         }
4857         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
4858       }
4859     }
4860 
4861     /* finished row so stick it into b->a */
4862     /* L part */
4863     pv   = b->a + bs2*bi[i] ;
4864     pj   = b->j + bi[i] ;
4865     nz   = bi[i+1] - bi[i];
4866     for (j=0; j<nz; j++) {
4867       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4868     }
4869 
4870     /* Mark diagonal and invert diagonal for simplier triangular solves */
4871     pv  = b->a + bs2*bdiag[i];
4872     pj  = b->j + bdiag[i];
4873     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4874     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4875     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
4876 
4877     /* U part */
4878     pv = b->a + bs2*bi[2*n-i];
4879     pj = b->j + bi[2*n-i];
4880     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
4881     for (j=0; j<nz; j++){
4882       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4883     }
4884   }
4885 
4886   ierr = PetscFree(rtmp);CHKERRQ(ierr);
4887   ierr = PetscFree(v_work);CHKERRQ(ierr);
4888   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4889   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4890 
4891   C->assembled = PETSC_TRUE;
4892   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
4893   PetscFunctionReturn(0);
4894 }
4895 
4896 #undef __FUNCT__
4897 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2"
4898 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
4899 {
4900   Mat            C=B;
4901   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
4902   IS             isrow = b->row,isicol = b->icol;
4903   PetscErrorCode ierr;
4904   const PetscInt *r,*ic,*ics;
4905   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
4906   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4907   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4908   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4909   MatScalar      *v_work;
4910 
4911   PetscFunctionBegin;
4912   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4913   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4914   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4915   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
4916   ics  = ic;
4917 
4918   /* generate work space needed by dense LU factorization */
4919   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
4920   mwork    = v_work + bs;
4921   v_pivots = (PetscInt*)(mwork + bs2);
4922 
4923   for (i=0; i<n; i++){
4924     /* zero rtmp */
4925     /* L part */
4926     nz    = bi[i+1] - bi[i];
4927     bjtmp = bj + bi[i];
4928     for  (j=0; j<nz; j++){
4929       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4930     }
4931 
4932     /* U part */
4933     nz = bdiag[i] - bdiag[i+1];
4934     bjtmp = bj + bdiag[i+1]+1;
4935     for  (j=0; j<nz; j++){
4936       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4937     }
4938 
4939     /* load in initial (unfactored row) */
4940     nz    = ai[r[i]+1] - ai[r[i]];
4941     ajtmp = aj + ai[r[i]];
4942     v     = aa + bs2*ai[r[i]];
4943     for (j=0; j<nz; j++) {
4944       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
4945     }
4946 
4947     /* elimination */
4948     bjtmp = bj + bi[i];
4949     nzL   = bi[i+1] - bi[i];
4950     for(k=0;k < nzL;k++) {
4951       row = bjtmp[k];
4952       pc = rtmp + bs2*row;
4953       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4954       if (flg) {
4955         pv         = b->a + bs2*bdiag[row];
4956         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
4957         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
4958         pv         = b->a + bs2*(bdiag[row+1]+1);
4959         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
4960         for (j=0; j<nz; j++) {
4961           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4962         }
4963         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
4964       }
4965     }
4966 
4967     /* finished row so stick it into b->a */
4968     /* L part */
4969     pv   = b->a + bs2*bi[i] ;
4970     pj   = b->j + bi[i] ;
4971     nz   = bi[i+1] - bi[i];
4972     for (j=0; j<nz; j++) {
4973       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4974     }
4975 
4976     /* Mark diagonal and invert diagonal for simplier triangular solves */
4977     pv  = b->a + bs2*bdiag[i];
4978     pj  = b->j + bdiag[i];
4979     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4980     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4981     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
4982 
4983     /* U part */
4984     pv = b->a + bs2*(bdiag[i+1]+1);
4985     pj = b->j + bdiag[i+1]+1;
4986     nz = bdiag[i] - bdiag[i+1] - 1;
4987     for (j=0; j<nz; j++){
4988       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4989     }
4990   }
4991 
4992   ierr = PetscFree(rtmp);CHKERRQ(ierr);
4993   ierr = PetscFree(v_work);CHKERRQ(ierr);
4994   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4995   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4996 
4997   C->assembled = PETSC_TRUE;
4998   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
4999   PetscFunctionReturn(0);
5000 }
5001 
5002 /*
5003    ilu(0) with natural ordering under new data structure.
5004    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
5005    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
5006 */
5007 #undef __FUNCT__
5008 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
5009 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5010 {
5011 
5012   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5013   PetscErrorCode     ierr;
5014   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5015   PetscInt           i,j,nz,*bi,*bj,*bdiag;
5016 
5017   PetscFunctionBegin;
5018   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
5019   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5020   b    = (Mat_SeqBAIJ*)(fact)->data;
5021 
5022   /* allocate matrix arrays for new data structure */
5023   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
5024   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
5025   b->singlemalloc = PETSC_TRUE;
5026   if (!b->diag){
5027     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5028   }
5029   bdiag = b->diag;
5030 
5031   if (n > 0) {
5032     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5033   }
5034 
5035   /* set bi and bj with new data structure */
5036   bi = b->i;
5037   bj = b->j;
5038 
5039   /* L part */
5040   bi[0] = 0;
5041   for (i=0; i<n; i++){
5042     nz = adiag[i] - ai[i];
5043     bi[i+1] = bi[i] + nz;
5044     aj = a->j + ai[i];
5045     for (j=0; j<nz; j++){
5046       *bj = aj[j]; bj++;
5047     }
5048   }
5049 
5050   /* U part */
5051   bi[n+1] = bi[n];
5052   for (i=n-1; i>=0; i--){
5053     nz = ai[i+1] - adiag[i] - 1;
5054     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
5055     aj = a->j + adiag[i] + 1;
5056     for (j=0; j<nz; j++){
5057       *bj = aj[j]; bj++;
5058     }
5059     /* diag[i] */
5060     *bj = i; bj++;
5061     bdiag[i] = bi[2*n-i+1]-1;
5062   }
5063   PetscFunctionReturn(0);
5064 }
5065 
5066 #undef __FUNCT__
5067 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
5068 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5069 {
5070   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5071   IS                 isicol;
5072   PetscErrorCode     ierr;
5073   const PetscInt     *r,*ic;
5074   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5075   PetscInt           *bi,*cols,nnz,*cols_lvl;
5076   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5077   PetscInt           i,levels,diagonal_fill;
5078   PetscTruth         col_identity,row_identity,both_identity;
5079   PetscReal          f;
5080   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5081   PetscBT            lnkbt;
5082   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5083   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5084   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5085   PetscTruth         missing;
5086   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5087 
5088   PetscFunctionBegin;
5089   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5090   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5091   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5092 
5093   f             = info->fill;
5094   levels        = (PetscInt)info->levels;
5095   diagonal_fill = (PetscInt)info->diagonal_fill;
5096   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5097 
5098   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5099   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5100   both_identity = (PetscTruth) (row_identity && col_identity);
5101 
5102   if (!levels && both_identity) {
5103     /* special case: ilu(0) with natural ordering */
5104     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5105     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
5106     /* set MatSolve routines */
5107     switch (bs){
5108     case 2:
5109       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
5110       break;
5111     case 3:
5112       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
5113       break;
5114     case 4:
5115       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
5116       break;
5117     case 5:
5118       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
5119       break;
5120     case 6:
5121       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
5122       break;
5123     case 7:
5124       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
5125       break;
5126     default:
5127       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
5128       break;
5129     }
5130 
5131     fact->factor = MAT_FACTOR_ILU;
5132     (fact)->info.factor_mallocs    = 0;
5133     (fact)->info.fill_ratio_given  = info->fill;
5134     (fact)->info.fill_ratio_needed = 1.0;
5135     b                = (Mat_SeqBAIJ*)(fact)->data;
5136     b->row           = isrow;
5137     b->col           = iscol;
5138     b->icol          = isicol;
5139     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5140     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5141     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5142     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5143     PetscFunctionReturn(0);
5144   }
5145 
5146   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5147   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5148 
5149   /* get new row pointers */
5150   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5151   bi[0] = 0;
5152   /* bdiag is location of diagonal in factor */
5153   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5154   bdiag[0]  = 0;
5155 
5156   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
5157   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
5158 
5159   /* create a linked list for storing column indices of the active row */
5160   nlnk = n + 1;
5161   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5162 
5163   /* initial FreeSpace size is f*(ai[n]+1) */
5164   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5165   current_space = free_space;
5166   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5167   current_space_lvl = free_space_lvl;
5168 
5169   for (i=0; i<n; i++) {
5170     nzi = 0;
5171     /* copy current row into linked list */
5172     nnz  = ai[r[i]+1] - ai[r[i]];
5173     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5174     cols = aj + ai[r[i]];
5175     lnk[i] = -1; /* marker to indicate if diagonal exists */
5176     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5177     nzi += nlnk;
5178 
5179     /* make sure diagonal entry is included */
5180     if (diagonal_fill && lnk[i] == -1) {
5181       fm = n;
5182       while (lnk[fm] < i) fm = lnk[fm];
5183       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5184       lnk[fm]    = i;
5185       lnk_lvl[i] = 0;
5186       nzi++; dcount++;
5187     }
5188 
5189     /* add pivot rows into the active row */
5190     nzbd = 0;
5191     prow = lnk[n];
5192     while (prow < i) {
5193       nnz      = bdiag[prow];
5194       cols     = bj_ptr[prow] + nnz + 1;
5195       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5196       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5197       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5198       nzi += nlnk;
5199       prow = lnk[prow];
5200       nzbd++;
5201     }
5202     bdiag[i] = nzbd;
5203     bi[i+1]  = bi[i] + nzi;
5204 
5205     /* if free space is not available, make more free space */
5206     if (current_space->local_remaining<nzi) {
5207       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5208       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5209       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5210       reallocs++;
5211     }
5212 
5213     /* copy data into free_space and free_space_lvl, then initialize lnk */
5214     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5215     bj_ptr[i]    = current_space->array;
5216     bjlvl_ptr[i] = current_space_lvl->array;
5217 
5218     /* make sure the active row i has diagonal entry */
5219     if (*(bj_ptr[i]+bdiag[i]) != i) {
5220       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5221     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5222     }
5223 
5224     current_space->array           += nzi;
5225     current_space->local_used      += nzi;
5226     current_space->local_remaining -= nzi;
5227     current_space_lvl->array           += nzi;
5228     current_space_lvl->local_used      += nzi;
5229     current_space_lvl->local_remaining -= nzi;
5230   }
5231 
5232   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5233   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5234 
5235   /* destroy list of free space and other temporary arrays */
5236   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5237 
5238   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5239   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5240 
5241   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5242   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5243   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
5244 
5245 #if defined(PETSC_USE_INFO)
5246   {
5247     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5248     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5249     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5250     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5251     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5252     if (diagonal_fill) {
5253       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5254     }
5255   }
5256 #endif
5257 
5258   /* put together the new matrix */
5259   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5260   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5261   b = (Mat_SeqBAIJ*)(fact)->data;
5262   b->free_a       = PETSC_TRUE;
5263   b->free_ij      = PETSC_TRUE;
5264   b->singlemalloc = PETSC_FALSE;
5265   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5266   b->j          = bj;
5267   b->i          = bi;
5268   b->diag       = bdiag;
5269   b->free_diag  = PETSC_TRUE;
5270   b->ilen       = 0;
5271   b->imax       = 0;
5272   b->row        = isrow;
5273   b->col        = iscol;
5274   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5275   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5276   b->icol       = isicol;
5277   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5278   /* In b structure:  Free imax, ilen, old a, old j.
5279      Allocate bdiag, solve_work, new a, new j */
5280   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5281   b->maxnz = b->nz = bi[2*n+1] ;
5282   (fact)->info.factor_mallocs    = reallocs;
5283   (fact)->info.fill_ratio_given  = f;
5284   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
5285   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
5286   /* set MatSolve routines */
5287   if (both_identity){
5288     switch (bs){
5289     case 2:
5290       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
5291       break;
5292     case 3:
5293       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
5294       break;
5295     case 4:
5296       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
5297       break;
5298     case 5:
5299       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
5300       break;
5301     case 6:
5302       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
5303       break;
5304     case 7:
5305       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
5306       break;
5307     default:
5308       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
5309       break;
5310     }
5311   } else {
5312     switch (bs){
5313     case 2:
5314       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
5315       break;
5316     case 3:
5317       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
5318       break;
5319     case 4:
5320       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
5321       break;
5322     case 5:
5323       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
5324       break;
5325     case 6:
5326       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
5327       break;
5328     case 7:
5329       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
5330       break;
5331     default:
5332       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
5333       break;
5334     }
5335   }
5336   PetscFunctionReturn(0);
5337 }
5338 
5339 /*
5340      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5341    except that the data structure of Mat_SeqAIJ is slightly different.
5342    Not a good example of code reuse.
5343 */
5344 #undef __FUNCT__
5345 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5346 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5347 {
5348   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5349   IS             isicol;
5350   PetscErrorCode ierr;
5351   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5352   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5353   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5354   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5355   PetscTruth     col_identity,row_identity,both_identity,flg;
5356   PetscReal      f;
5357   PetscTruth     newdatastruct=PETSC_FALSE;
5358 
5359   PetscFunctionBegin;
5360   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
5361   if (newdatastruct){
5362     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5363     PetscFunctionReturn(0);
5364   }
5365 
5366   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5367   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5368 
5369   f             = info->fill;
5370   levels        = (PetscInt)info->levels;
5371   diagonal_fill = (PetscInt)info->diagonal_fill;
5372   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5373 
5374   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5375   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5376   both_identity = (PetscTruth) (row_identity && col_identity);
5377 
5378   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5379     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5380     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5381 
5382     fact->factor = MAT_FACTOR_ILU;
5383     b            = (Mat_SeqBAIJ*)(fact)->data;
5384     b->row       = isrow;
5385     b->col       = iscol;
5386     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5387     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5388     b->icol      = isicol;
5389     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5390     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5391     PetscFunctionReturn(0);
5392   }
5393 
5394   /* general case perform the symbolic factorization */
5395     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5396     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5397 
5398     /* get new row pointers */
5399     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5400     ainew[0] = 0;
5401     /* don't know how many column pointers are needed so estimate */
5402     jmax = (PetscInt)(f*ai[n] + 1);
5403     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5404     /* ajfill is level of fill for each fill entry */
5405     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5406     /* fill is a linked list of nonzeros in active row */
5407     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5408     /* im is level for each filled value */
5409     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5410     /* dloc is location of diagonal in factor */
5411     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5412     dloc[0]  = 0;
5413     for (prow=0; prow<n; prow++) {
5414 
5415       /* copy prow into linked list */
5416       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5417       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5418       xi         = aj + ai[r[prow]];
5419       fill[n]    = n;
5420       fill[prow] = -1; /* marker for diagonal entry */
5421       while (nz--) {
5422 	fm  = n;
5423 	idx = ic[*xi++];
5424 	do {
5425 	  m  = fm;
5426 	  fm = fill[m];
5427 	} while (fm < idx);
5428 	fill[m]   = idx;
5429 	fill[idx] = fm;
5430 	im[idx]   = 0;
5431       }
5432 
5433       /* make sure diagonal entry is included */
5434       if (diagonal_fill && fill[prow] == -1) {
5435 	fm = n;
5436 	while (fill[fm] < prow) fm = fill[fm];
5437 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5438 	fill[fm]   = prow;
5439 	im[prow]   = 0;
5440 	nzf++;
5441 	dcount++;
5442       }
5443 
5444       nzi = 0;
5445       row = fill[n];
5446       while (row < prow) {
5447 	incrlev = im[row] + 1;
5448 	nz      = dloc[row];
5449 	xi      = ajnew  + ainew[row] + nz + 1;
5450 	flev    = ajfill + ainew[row] + nz + 1;
5451 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5452 	fm      = row;
5453 	while (nnz-- > 0) {
5454 	  idx = *xi++;
5455 	  if (*flev + incrlev > levels) {
5456 	    flev++;
5457 	    continue;
5458 	  }
5459 	  do {
5460 	    m  = fm;
5461 	    fm = fill[m];
5462 	  } while (fm < idx);
5463 	  if (fm != idx) {
5464 	    im[idx]   = *flev + incrlev;
5465 	    fill[m]   = idx;
5466 	    fill[idx] = fm;
5467 	    fm        = idx;
5468 	    nzf++;
5469 	  } else {
5470 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5471 	  }
5472 	  flev++;
5473 	}
5474 	row = fill[row];
5475 	nzi++;
5476       }
5477       /* copy new filled row into permanent storage */
5478       ainew[prow+1] = ainew[prow] + nzf;
5479       if (ainew[prow+1] > jmax) {
5480 
5481 	/* estimate how much additional space we will need */
5482 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5483 	/* just double the memory each time */
5484 	PetscInt maxadd = jmax;
5485 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5486 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5487 	jmax += maxadd;
5488 
5489 	/* allocate a longer ajnew and ajfill */
5490 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5491 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5492 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5493 	ajnew = xitmp;
5494 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5495 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5496 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5497 	ajfill = xitmp;
5498 	reallocate++; /* count how many reallocations are needed */
5499       }
5500       xitmp       = ajnew + ainew[prow];
5501       flev        = ajfill + ainew[prow];
5502       dloc[prow]  = nzi;
5503       fm          = fill[n];
5504       while (nzf--) {
5505 	*xitmp++ = fm;
5506 	*flev++ = im[fm];
5507 	fm      = fill[fm];
5508       }
5509       /* make sure row has diagonal entry */
5510       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5511 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5512     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5513       }
5514     }
5515     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5516     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5517     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5518     ierr = PetscFree(fill);CHKERRQ(ierr);
5519     ierr = PetscFree(im);CHKERRQ(ierr);
5520 
5521 #if defined(PETSC_USE_INFO)
5522     {
5523       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5524       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5525       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5526       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5527       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5528       if (diagonal_fill) {
5529 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5530       }
5531     }
5532 #endif
5533 
5534     /* put together the new matrix */
5535     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5536     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5537     b    = (Mat_SeqBAIJ*)(fact)->data;
5538     b->free_a       = PETSC_TRUE;
5539     b->free_ij      = PETSC_TRUE;
5540     b->singlemalloc = PETSC_FALSE;
5541     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5542     b->j          = ajnew;
5543     b->i          = ainew;
5544     for (i=0; i<n; i++) dloc[i] += ainew[i];
5545     b->diag       = dloc;
5546     b->free_diag  = PETSC_TRUE;
5547     b->ilen       = 0;
5548     b->imax       = 0;
5549     b->row        = isrow;
5550     b->col        = iscol;
5551     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5552     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5553     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5554     b->icol       = isicol;
5555     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5556     /* In b structure:  Free imax, ilen, old a, old j.
5557        Allocate dloc, solve_work, new a, new j */
5558     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5559     b->maxnz          = b->nz = ainew[n];
5560 
5561     (fact)->info.factor_mallocs    = reallocate;
5562     (fact)->info.fill_ratio_given  = f;
5563     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5564 
5565   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5566   PetscFunctionReturn(0);
5567 }
5568 
5569 #undef __FUNCT__
5570 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5571 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5572 {
5573   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5574   /* int i,*AJ=a->j,nz=a->nz; */
5575   PetscFunctionBegin;
5576   /* Undo Column scaling */
5577 /*    while (nz--) { */
5578 /*      AJ[i] = AJ[i]/4; */
5579 /*    } */
5580   /* This should really invoke a push/pop logic, but we don't have that yet. */
5581   A->ops->setunfactored = PETSC_NULL;
5582   PetscFunctionReturn(0);
5583 }
5584 
5585 #undef __FUNCT__
5586 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5587 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5588 {
5589   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5590   PetscInt       *AJ=a->j,nz=a->nz;
5591   unsigned short *aj=(unsigned short *)AJ;
5592   PetscFunctionBegin;
5593   /* Is this really necessary? */
5594   while (nz--) {
5595     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5596   }
5597   A->ops->setunfactored = PETSC_NULL;
5598   PetscFunctionReturn(0);
5599 }
5600 
5601 
5602