xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision db4fd139fb79ff8a7651981cdeaefb7e43c6030b)
1 /*$Id: sbaijfact2.c,v 1.1 2000/06/21 15:47:02 balay Exp balay $*/
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include "sbaij.h"
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/ilu.h"
10 #include "src/inline/dot.h"
11 
12 #undef __FUNC__
13 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering"
14 int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
17   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   int             *diag = a->diag;
19   MatScalar       *aa=a->a,*v;
20   Scalar          s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25 
26   /* forward solve the U^T */
27   for (i=0; i<n; i++) {
28 
29     v     = aa + diag[i];
30     /* multiply by the inverse of the block diagonal */
31     s1    = (*v++)*b[i];
32     vi    = aj + diag[i] + 1;
33     nz    = ai[i+1] - diag[i] - 1;
34     while (nz--) {
35       x[*vi++]  -= (*v++)*s1;
36     }
37     x[i]   = s1;
38   }
39   /* backward solve the L^T */
40   for (i=n-1; i>=0; i--){
41     v    = aa + diag[i] - 1;
42     vi   = aj + diag[i] - 1;
43     nz   = diag[i] - ai[i];
44     s1   = x[i];
45     while (nz--) {
46       x[*vi--]   -=  (*v--)*s1;
47     }
48   }
49   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51   PLogFlops(2*(a->nz) - a->n);
52   PetscFunctionReturn(0);
53 }
54 
55 #undef __FUNC__
56 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering"
57 int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58 {
59   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
60   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61   int             *diag = a->diag,oidx;
62   MatScalar       *aa=a->a,*v;
63   Scalar          s1,s2,x1,x2;
64   Scalar          *x,*b;
65 
66   PetscFunctionBegin;
67   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
68   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
69 
70   /* forward solve the U^T */
71   idx = 0;
72   for (i=0; i<n; i++) {
73 
74     v     = aa + 4*diag[i];
75     /* multiply by the inverse of the block diagonal */
76     x1 = b[idx];   x2 = b[1+idx];
77     s1 = v[0]*x1  +  v[1]*x2;
78     s2 = v[2]*x1  +  v[3]*x2;
79     v += 4;
80 
81     vi    = aj + diag[i] + 1;
82     nz    = ai[i+1] - diag[i] - 1;
83     while (nz--) {
84       oidx = 2*(*vi++);
85       x[oidx]   -= v[0]*s1  +  v[1]*s2;
86       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
87       v  += 4;
88     }
89     x[idx]   = s1;x[1+idx] = s2;
90     idx += 2;
91   }
92   /* backward solve the L^T */
93   for (i=n-1; i>=0; i--){
94     v    = aa + 4*diag[i] - 4;
95     vi   = aj + diag[i] - 1;
96     nz   = diag[i] - ai[i];
97     idt  = 2*i;
98     s1   = x[idt];  s2 = x[1+idt];
99     while (nz--) {
100       idx   = 2*(*vi--);
101       x[idx]   -=  v[0]*s1 +  v[1]*s2;
102       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
103       v -= 4;
104     }
105   }
106   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
107   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
108   PLogFlops(2*4*(a->nz) - 2*a->n);
109   PetscFunctionReturn(0);
110 }
111 
112 #undef __FUNC__
113 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering"
114 int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
115 {
116   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
117   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
118   int             *diag = a->diag,oidx;
119   MatScalar       *aa=a->a,*v;
120   Scalar          s1,s2,s3,x1,x2,x3;
121   Scalar          *x,*b;
122 
123   PetscFunctionBegin;
124   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
125   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126 
127   /* forward solve the U^T */
128   idx = 0;
129   for (i=0; i<n; i++) {
130 
131     v     = aa + 9*diag[i];
132     /* multiply by the inverse of the block diagonal */
133     x1 = b[idx];   x2 = b[1+idx]; x3    = b[2+idx];
134     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
135     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
136     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
137     v += 9;
138 
139     vi    = aj + diag[i] + 1;
140     nz    = ai[i+1] - diag[i] - 1;
141     while (nz--) {
142       oidx = 3*(*vi++);
143       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
144       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
145       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
146       v  += 9;
147     }
148     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
149     idx += 3;
150   }
151   /* backward solve the L^T */
152   for (i=n-1; i>=0; i--){
153     v    = aa + 9*diag[i] - 9;
154     vi   = aj + diag[i] - 1;
155     nz   = diag[i] - ai[i];
156     idt  = 3*i;
157     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
158     while (nz--) {
159       idx   = 3*(*vi--);
160       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
161       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
162       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
163       v -= 9;
164     }
165   }
166   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
168   PLogFlops(2*9*(a->nz) - 3*a->n);
169   PetscFunctionReturn(0);
170 }
171 
172 #undef __FUNC__
173 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering"
174 int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
175 {
176   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
177   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
178   int             *diag = a->diag,oidx;
179   MatScalar       *aa=a->a,*v;
180   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
181   Scalar          *x,*b;
182 
183   PetscFunctionBegin;
184   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
185   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
186 
187   /* forward solve the U^T */
188   idx = 0;
189   for (i=0; i<n; i++) {
190 
191     v     = aa + 16*diag[i];
192     /* multiply by the inverse of the block diagonal */
193     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx];
194     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
195     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
196     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
197     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
198     v += 16;
199 
200     vi    = aj + diag[i] + 1;
201     nz    = ai[i+1] - diag[i] - 1;
202     while (nz--) {
203       oidx = 4*(*vi++);
204       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
205       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
206       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
207       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
208       v  += 16;
209     }
210     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
211     idx += 4;
212   }
213   /* backward solve the L^T */
214   for (i=n-1; i>=0; i--){
215     v    = aa + 16*diag[i] - 16;
216     vi   = aj + diag[i] - 1;
217     nz   = diag[i] - ai[i];
218     idt  = 4*i;
219     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
220     while (nz--) {
221       idx   = 4*(*vi--);
222       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
223       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
224       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
225       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
226       v -= 16;
227     }
228   }
229   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
230   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
231   PLogFlops(2*16*(a->nz) - 4*a->n);
232   PetscFunctionReturn(0);
233 }
234 
235 #undef __FUNC__
236 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering"
237 int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
238 {
239   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
240   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
241   int             *diag = a->diag,oidx;
242   MatScalar       *aa=a->a,*v;
243   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
244   Scalar          *x,*b;
245 
246   PetscFunctionBegin;
247   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
248   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
249 
250   /* forward solve the U^T */
251   idx = 0;
252   for (i=0; i<n; i++) {
253 
254     v     = aa + 25*diag[i];
255     /* multiply by the inverse of the block diagonal */
256     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
257     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
258     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
259     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
260     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
261     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
262     v += 25;
263 
264     vi    = aj + diag[i] + 1;
265     nz    = ai[i+1] - diag[i] - 1;
266     while (nz--) {
267       oidx = 5*(*vi++);
268       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
269       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
270       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
271       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
272       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
273       v  += 25;
274     }
275     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
276     idx += 5;
277   }
278   /* backward solve the L^T */
279   for (i=n-1; i>=0; i--){
280     v    = aa + 25*diag[i] - 25;
281     vi   = aj + diag[i] - 1;
282     nz   = diag[i] - ai[i];
283     idt  = 5*i;
284     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
285     while (nz--) {
286       idx   = 5*(*vi--);
287       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
288       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
289       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
290       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
291       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
292       v -= 25;
293     }
294   }
295   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
296   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
297   PLogFlops(2*25*(a->nz) - 5*a->n);
298   PetscFunctionReturn(0);
299 }
300 
301 #undef __FUNC__
302 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering"
303 int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
304 {
305   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
306   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307   int             *diag = a->diag,oidx;
308   MatScalar       *aa=a->a,*v;
309   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
310   Scalar          *x,*b;
311 
312   PetscFunctionBegin;
313   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
314   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
315 
316   /* forward solve the U^T */
317   idx = 0;
318   for (i=0; i<n; i++) {
319 
320     v     = aa + 36*diag[i];
321     /* multiply by the inverse of the block diagonal */
322     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
323     x6    = b[5+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
325     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
326     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
327     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
328     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
329     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
330     v += 36;
331 
332     vi    = aj + diag[i] + 1;
333     nz    = ai[i+1] - diag[i] - 1;
334     while (nz--) {
335       oidx = 6*(*vi++);
336       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
337       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
338       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
339       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
340       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
341       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
342       v  += 36;
343     }
344     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
345     x[5+idx] = s6;
346     idx += 6;
347   }
348   /* backward solve the L^T */
349   for (i=n-1; i>=0; i--){
350     v    = aa + 36*diag[i] - 36;
351     vi   = aj + diag[i] - 1;
352     nz   = diag[i] - ai[i];
353     idt  = 6*i;
354     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
355     s6 = x[5+idt];
356     while (nz--) {
357       idx   = 6*(*vi--);
358       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
359       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
360       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
361       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
362       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
363       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
364       v -= 36;
365     }
366   }
367   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
369   PLogFlops(2*36*(a->nz) - 6*a->n);
370   PetscFunctionReturn(0);
371 }
372 
373 #undef __FUNC__
374 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering"
375 int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
376 {
377   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
378   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
379   int             *diag = a->diag,oidx;
380   MatScalar       *aa=a->a,*v;
381   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
382   Scalar          *x,*b;
383 
384   PetscFunctionBegin;
385   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
386   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
387 
388   /* forward solve the U^T */
389   idx = 0;
390   for (i=0; i<n; i++) {
391 
392     v     = aa + 49*diag[i];
393     /* multiply by the inverse of the block diagonal */
394     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
395     x6    = b[5+idx]; x7 = b[6+idx];
396     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
397     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
398     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
399     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
400     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
401     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
402     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
403     v += 49;
404 
405     vi    = aj + diag[i] + 1;
406     nz    = ai[i+1] - diag[i] - 1;
407     while (nz--) {
408       oidx = 7*(*vi++);
409       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
410       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
411       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
412       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
413       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
414       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
415       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
416       v  += 49;
417     }
418     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
419     x[5+idx] = s6;x[6+idx] = s7;
420     idx += 7;
421   }
422   /* backward solve the L^T */
423   for (i=n-1; i>=0; i--){
424     v    = aa + 49*diag[i] - 49;
425     vi   = aj + diag[i] - 1;
426     nz   = diag[i] - ai[i];
427     idt  = 7*i;
428     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
429     s6 = x[5+idt];s7 = x[6+idt];
430     while (nz--) {
431       idx   = 7*(*vi--);
432       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
433       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
434       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
435       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
436       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
437       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
438       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
439       v -= 49;
440     }
441   }
442   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
444   PLogFlops(2*49*(a->nz) - 7*a->n);
445   PetscFunctionReturn(0);
446 }
447 
448 /*---------------------------------------------------------------------------------------------*/
449 #undef __FUNC__
450 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_1"
451 int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
452 {
453   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
454   IS              iscol=a->col,isrow=a->row;
455   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
456   int             *diag = a->diag;
457   MatScalar       *aa=a->a,*v;
458   Scalar          s1,*x,*b,*t;
459 
460   PetscFunctionBegin;
461   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
462   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
463   t  = a->solve_work;
464 
465   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
466   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
467 
468   /* copy the b into temp work space according to permutation */
469   for (i=0; i<n; i++) {
470     t[i] = b[c[i]];
471   }
472 
473   /* forward solve the U^T */
474   for (i=0; i<n; i++) {
475 
476     v     = aa + diag[i];
477     /* multiply by the inverse of the block diagonal */
478     s1    = (*v++)*t[i];
479     vi    = aj + diag[i] + 1;
480     nz    = ai[i+1] - diag[i] - 1;
481     while (nz--) {
482       t[*vi++]  -= (*v++)*s1;
483     }
484     t[i]   = s1;
485   }
486   /* backward solve the L^T */
487   for (i=n-1; i>=0; i--){
488     v    = aa + diag[i] - 1;
489     vi   = aj + diag[i] - 1;
490     nz   = diag[i] - ai[i];
491     s1   = t[i];
492     while (nz--) {
493       t[*vi--]   -=  (*v--)*s1;
494     }
495   }
496 
497   /* copy t into x according to permutation */
498   for (i=0; i<n; i++) {
499     x[r[i]]   = t[i];
500   }
501 
502   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
503   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
504   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
505   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
506   PLogFlops(2*(a->nz) - a->n);
507   PetscFunctionReturn(0);
508 }
509 
510 #undef __FUNC__
511 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_2"
512 int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
513 {
514   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
515   IS              iscol=a->col,isrow=a->row;
516   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
517   int             *diag = a->diag,ii,ic,ir,oidx;
518   MatScalar       *aa=a->a,*v;
519   Scalar          s1,s2,x1,x2;
520   Scalar          *x,*b,*t;
521 
522   PetscFunctionBegin;
523   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
524   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
525   t  = a->solve_work;
526 
527   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
528   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
529 
530   /* copy the b into temp work space according to permutation */
531   ii = 0;
532   for (i=0; i<n; i++) {
533     ic      = 2*c[i];
534     t[ii]   = b[ic];
535     t[ii+1] = b[ic+1];
536     ii += 2;
537   }
538 
539   /* forward solve the U^T */
540   idx = 0;
541   for (i=0; i<n; i++) {
542 
543     v     = aa + 4*diag[i];
544     /* multiply by the inverse of the block diagonal */
545     x1    = t[idx];   x2 = t[1+idx];
546     s1 = v[0]*x1  +  v[1]*x2;
547     s2 = v[2]*x1  +  v[3]*x2;
548     v += 4;
549 
550     vi    = aj + diag[i] + 1;
551     nz    = ai[i+1] - diag[i] - 1;
552     while (nz--) {
553       oidx = 2*(*vi++);
554       t[oidx]   -= v[0]*s1  +  v[1]*s2;
555       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
556       v  += 4;
557     }
558     t[idx]   = s1;t[1+idx] = s2;
559     idx += 2;
560   }
561   /* backward solve the L^T */
562   for (i=n-1; i>=0; i--){
563     v    = aa + 4*diag[i] - 4;
564     vi   = aj + diag[i] - 1;
565     nz   = diag[i] - ai[i];
566     idt  = 2*i;
567     s1 = t[idt];  s2 = t[1+idt];
568     while (nz--) {
569       idx   = 2*(*vi--);
570       t[idx]   -=  v[0]*s1 +  v[1]*s2;
571       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
572       v -= 4;
573     }
574   }
575 
576   /* copy t into x according to permutation */
577   ii = 0;
578   for (i=0; i<n; i++) {
579     ir      = 2*r[i];
580     x[ir]   = t[ii];
581     x[ir+1] = t[ii+1];
582     ii += 2;
583   }
584 
585   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
586   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
587   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
588   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
589   PLogFlops(2*4*(a->nz) - 2*a->n);
590   PetscFunctionReturn(0);
591 }
592 
593 #undef __FUNC__
594 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_3"
595 int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
596 {
597   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
598   IS              iscol=a->col,isrow=a->row;
599   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
600   int             *diag = a->diag,ii,ic,ir,oidx;
601   MatScalar       *aa=a->a,*v;
602   Scalar          s1,s2,s3,x1,x2,x3;
603   Scalar          *x,*b,*t;
604 
605   PetscFunctionBegin;
606   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
607   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
608   t  = a->solve_work;
609 
610   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
611   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
612 
613   /* copy the b into temp work space according to permutation */
614   ii = 0;
615   for (i=0; i<n; i++) {
616     ic      = 3*c[i];
617     t[ii]   = b[ic];
618     t[ii+1] = b[ic+1];
619     t[ii+2] = b[ic+2];
620     ii += 3;
621   }
622 
623   /* forward solve the U^T */
624   idx = 0;
625   for (i=0; i<n; i++) {
626 
627     v     = aa + 9*diag[i];
628     /* multiply by the inverse of the block diagonal */
629     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
630     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
631     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
632     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
633     v += 9;
634 
635     vi    = aj + diag[i] + 1;
636     nz    = ai[i+1] - diag[i] - 1;
637     while (nz--) {
638       oidx = 3*(*vi++);
639       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
640       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
641       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
642       v  += 9;
643     }
644     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
645     idx += 3;
646   }
647   /* backward solve the L^T */
648   for (i=n-1; i>=0; i--){
649     v    = aa + 9*diag[i] - 9;
650     vi   = aj + diag[i] - 1;
651     nz   = diag[i] - ai[i];
652     idt  = 3*i;
653     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
654     while (nz--) {
655       idx   = 3*(*vi--);
656       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
657       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
658       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
659       v -= 9;
660     }
661   }
662 
663   /* copy t into x according to permutation */
664   ii = 0;
665   for (i=0; i<n; i++) {
666     ir      = 3*r[i];
667     x[ir]   = t[ii];
668     x[ir+1] = t[ii+1];
669     x[ir+2] = t[ii+2];
670     ii += 3;
671   }
672 
673   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
674   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
675   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
676   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
677   PLogFlops(2*9*(a->nz) - 3*a->n);
678   PetscFunctionReturn(0);
679 }
680 
681 #undef __FUNC__
682 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_4"
683 int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
684 {
685   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
686   IS              iscol=a->col,isrow=a->row;
687   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
688   int             *diag = a->diag,ii,ic,ir,oidx;
689   MatScalar       *aa=a->a,*v;
690   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
691   Scalar          *x,*b,*t;
692 
693   PetscFunctionBegin;
694   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
695   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
696   t  = a->solve_work;
697 
698   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
699   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
700 
701   /* copy the b into temp work space according to permutation */
702   ii = 0;
703   for (i=0; i<n; i++) {
704     ic      = 4*c[i];
705     t[ii]   = b[ic];
706     t[ii+1] = b[ic+1];
707     t[ii+2] = b[ic+2];
708     t[ii+3] = b[ic+3];
709     ii += 4;
710   }
711 
712   /* forward solve the U^T */
713   idx = 0;
714   for (i=0; i<n; i++) {
715 
716     v     = aa + 16*diag[i];
717     /* multiply by the inverse of the block diagonal */
718     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
719     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
720     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
721     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
722     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
723     v += 16;
724 
725     vi    = aj + diag[i] + 1;
726     nz    = ai[i+1] - diag[i] - 1;
727     while (nz--) {
728       oidx = 4*(*vi++);
729       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
730       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
731       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
732       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
733       v  += 16;
734     }
735     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
736     idx += 4;
737   }
738   /* backward solve the L^T */
739   for (i=n-1; i>=0; i--){
740     v    = aa + 16*diag[i] - 16;
741     vi   = aj + diag[i] - 1;
742     nz   = diag[i] - ai[i];
743     idt  = 4*i;
744     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
745     while (nz--) {
746       idx   = 4*(*vi--);
747       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
748       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
749       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
750       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
751       v -= 16;
752     }
753   }
754 
755   /* copy t into x according to permutation */
756   ii = 0;
757   for (i=0; i<n; i++) {
758     ir      = 4*r[i];
759     x[ir]   = t[ii];
760     x[ir+1] = t[ii+1];
761     x[ir+2] = t[ii+2];
762     x[ir+3] = t[ii+3];
763     ii += 4;
764   }
765 
766   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
767   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
768   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   PLogFlops(2*16*(a->nz) - 4*a->n);
771   PetscFunctionReturn(0);
772 }
773 
774 #undef __FUNC__
775 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_5"
776 int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
777 {
778   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
779   IS              iscol=a->col,isrow=a->row;
780   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
781   int             *diag = a->diag,ii,ic,ir,oidx;
782   MatScalar       *aa=a->a,*v;
783   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
784   Scalar          *x,*b,*t;
785 
786   PetscFunctionBegin;
787   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
788   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
789   t  = a->solve_work;
790 
791   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
792   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
793 
794   /* copy the b into temp work space according to permutation */
795   ii = 0;
796   for (i=0; i<n; i++) {
797     ic      = 5*c[i];
798     t[ii]   = b[ic];
799     t[ii+1] = b[ic+1];
800     t[ii+2] = b[ic+2];
801     t[ii+3] = b[ic+3];
802     t[ii+4] = b[ic+4];
803     ii += 5;
804   }
805 
806   /* forward solve the U^T */
807   idx = 0;
808   for (i=0; i<n; i++) {
809 
810     v     = aa + 25*diag[i];
811     /* multiply by the inverse of the block diagonal */
812     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
813     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
814     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
815     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
816     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
817     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
818     v += 25;
819 
820     vi    = aj + diag[i] + 1;
821     nz    = ai[i+1] - diag[i] - 1;
822     while (nz--) {
823       oidx = 5*(*vi++);
824       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
825       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
826       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
827       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
828       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
829       v  += 25;
830     }
831     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
832     idx += 5;
833   }
834   /* backward solve the L^T */
835   for (i=n-1; i>=0; i--){
836     v    = aa + 25*diag[i] - 25;
837     vi   = aj + diag[i] - 1;
838     nz   = diag[i] - ai[i];
839     idt  = 5*i;
840     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
841     while (nz--) {
842       idx   = 5*(*vi--);
843       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
844       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
845       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
846       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
847       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
848       v -= 25;
849     }
850   }
851 
852   /* copy t into x according to permutation */
853   ii = 0;
854   for (i=0; i<n; i++) {
855     ir      = 5*r[i];
856     x[ir]   = t[ii];
857     x[ir+1] = t[ii+1];
858     x[ir+2] = t[ii+2];
859     x[ir+3] = t[ii+3];
860     x[ir+4] = t[ii+4];
861     ii += 5;
862   }
863 
864   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
865   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
866   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
867   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
868   PLogFlops(2*25*(a->nz) - 5*a->n);
869   PetscFunctionReturn(0);
870 }
871 
872 #undef __FUNC__
873 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_6"
874 int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
875 {
876   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
877   IS              iscol=a->col,isrow=a->row;
878   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
879   int             *diag = a->diag,ii,ic,ir,oidx;
880   MatScalar       *aa=a->a,*v;
881   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
882   Scalar          *x,*b,*t;
883 
884   PetscFunctionBegin;
885   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
886   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
887   t  = a->solve_work;
888 
889   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
890   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
891 
892   /* copy the b into temp work space according to permutation */
893   ii = 0;
894   for (i=0; i<n; i++) {
895     ic      = 6*c[i];
896     t[ii]   = b[ic];
897     t[ii+1] = b[ic+1];
898     t[ii+2] = b[ic+2];
899     t[ii+3] = b[ic+3];
900     t[ii+4] = b[ic+4];
901     t[ii+5] = b[ic+5];
902     ii += 6;
903   }
904 
905   /* forward solve the U^T */
906   idx = 0;
907   for (i=0; i<n; i++) {
908 
909     v     = aa + 36*diag[i];
910     /* multiply by the inverse of the block diagonal */
911     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
912     x6    = t[5+idx];
913     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
914     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
915     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
916     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
917     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
918     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
919     v += 36;
920 
921     vi    = aj + diag[i] + 1;
922     nz    = ai[i+1] - diag[i] - 1;
923     while (nz--) {
924       oidx = 6*(*vi++);
925       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
926       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
927       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
928       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
929       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
930       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
931       v  += 36;
932     }
933     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
934     t[5+idx] = s6;
935     idx += 6;
936   }
937   /* backward solve the L^T */
938   for (i=n-1; i>=0; i--){
939     v    = aa + 36*diag[i] - 36;
940     vi   = aj + diag[i] - 1;
941     nz   = diag[i] - ai[i];
942     idt  = 6*i;
943     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
944     s6 = t[5+idt];
945     while (nz--) {
946       idx   = 6*(*vi--);
947       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
948       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
949       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
950       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
951       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
952       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
953       v -= 36;
954     }
955   }
956 
957   /* copy t into x according to permutation */
958   ii = 0;
959   for (i=0; i<n; i++) {
960     ir      = 6*r[i];
961     x[ir]   = t[ii];
962     x[ir+1] = t[ii+1];
963     x[ir+2] = t[ii+2];
964     x[ir+3] = t[ii+3];
965     x[ir+4] = t[ii+4];
966     x[ir+5] = t[ii+5];
967     ii += 6;
968   }
969 
970   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
971   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
972   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
973   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
974   PLogFlops(2*36*(a->nz) - 6*a->n);
975   PetscFunctionReturn(0);
976 }
977 
978 #undef __FUNC__
979 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_7"
980 int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
981 {
982   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
983   IS              iscol=a->col,isrow=a->row;
984   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
985   int             *diag = a->diag,ii,ic,ir,oidx;
986   MatScalar       *aa=a->a,*v;
987   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
988   Scalar          *x,*b,*t;
989 
990   PetscFunctionBegin;
991   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
992   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
993   t  = a->solve_work;
994 
995   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
996   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
997 
998   /* copy the b into temp work space according to permutation */
999   ii = 0;
1000   for (i=0; i<n; i++) {
1001     ic      = 7*c[i];
1002     t[ii]   = b[ic];
1003     t[ii+1] = b[ic+1];
1004     t[ii+2] = b[ic+2];
1005     t[ii+3] = b[ic+3];
1006     t[ii+4] = b[ic+4];
1007     t[ii+5] = b[ic+5];
1008     t[ii+6] = b[ic+6];
1009     ii += 7;
1010   }
1011 
1012   /* forward solve the U^T */
1013   idx = 0;
1014   for (i=0; i<n; i++) {
1015 
1016     v     = aa + 49*diag[i];
1017     /* multiply by the inverse of the block diagonal */
1018     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1019     x6    = t[5+idx]; x7 = t[6+idx];
1020     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1021     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1022     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1023     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1024     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1025     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1026     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1027     v += 49;
1028 
1029     vi    = aj + diag[i] + 1;
1030     nz    = ai[i+1] - diag[i] - 1;
1031     while (nz--) {
1032       oidx = 7*(*vi++);
1033       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1034       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1035       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1036       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1037       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1038       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1039       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1040       v  += 49;
1041     }
1042     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1043     t[5+idx] = s6;t[6+idx] = s7;
1044     idx += 7;
1045   }
1046   /* backward solve the L^T */
1047   for (i=n-1; i>=0; i--){
1048     v    = aa + 49*diag[i] - 49;
1049     vi   = aj + diag[i] - 1;
1050     nz   = diag[i] - ai[i];
1051     idt  = 7*i;
1052     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1053     s6 = t[5+idt];s7 = t[6+idt];
1054     while (nz--) {
1055       idx   = 7*(*vi--);
1056       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1057       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1058       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1059       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1060       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1061       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1062       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1063       v -= 49;
1064     }
1065   }
1066 
1067   /* copy t into x according to permutation */
1068   ii = 0;
1069   for (i=0; i<n; i++) {
1070     ir      = 7*r[i];
1071     x[ir]   = t[ii];
1072     x[ir+1] = t[ii+1];
1073     x[ir+2] = t[ii+2];
1074     x[ir+3] = t[ii+3];
1075     x[ir+4] = t[ii+4];
1076     x[ir+5] = t[ii+5];
1077     x[ir+6] = t[ii+6];
1078     ii += 7;
1079   }
1080 
1081   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1082   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1083   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1084   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1085   PLogFlops(2*49*(a->nz) - 7*a->n);
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 /* ----------------------------------------------------------- */
1090 #undef __FUNC__
1091 #define __FUNC__ "MatSolve_SeqSBAIJ_N"
1092 int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
1093 {
1094   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1095   IS              iscol=a->col,isrow=a->row;
1096   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1097   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1098   MatScalar       *aa=a->a,*v;
1099   Scalar          *x,*b,*s,*t,*ls;
1100 
1101   PetscFunctionBegin;
1102   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1103   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1104   t  = a->solve_work;
1105 
1106   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1107   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1108 
1109   /* forward solve the lower triangular */
1110   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1111   for (i=1; i<n; i++) {
1112     v   = aa + bs2*ai[i];
1113     vi  = aj + ai[i];
1114     nz  = a->diag[i] - ai[i];
1115     s = t + bs*i;
1116     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1117     while (nz--) {
1118       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1119       v += bs2;
1120     }
1121   }
1122   /* backward solve the upper triangular */
1123   ls = a->solve_work + a->n;
1124   for (i=n-1; i>=0; i--){
1125     v   = aa + bs2*(a->diag[i] + 1);
1126     vi  = aj + a->diag[i] + 1;
1127     nz  = ai[i+1] - a->diag[i] - 1;
1128     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1129     while (nz--) {
1130       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1131       v += bs2;
1132     }
1133     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1134     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1135   }
1136 
1137   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1138   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1139   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1140   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1141   PLogFlops(2*(a->bs2)*(a->nz) - a->bs*a->n);
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 #undef __FUNC__
1146 #define __FUNC__ "MatSolve_SeqSBAIJ_7"
1147 int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
1148 {
1149   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1150   IS              iscol=a->col,isrow=a->row;
1151   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1152   int             *diag = a->diag;
1153   MatScalar       *aa=a->a,*v;
1154   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1155   Scalar          *x,*b,*t;
1156 
1157   PetscFunctionBegin;
1158   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1159   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1160   t  = a->solve_work;
1161 
1162   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1163   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1164 
1165   /* forward solve the lower triangular */
1166   idx    = 7*(*r++);
1167   t[0] = b[idx];   t[1] = b[1+idx];
1168   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1169   t[5] = b[5+idx]; t[6] = b[6+idx];
1170 
1171   for (i=1; i<n; i++) {
1172     v     = aa + 49*ai[i];
1173     vi    = aj + ai[i];
1174     nz    = diag[i] - ai[i];
1175     idx   = 7*(*r++);
1176     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1177     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1178     while (nz--) {
1179       idx   = 7*(*vi++);
1180       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1181       x4    = t[3+idx];x5 = t[4+idx];
1182       x6    = t[5+idx];x7 = t[6+idx];
1183       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1184       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1185       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1186       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1187       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1188       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1189       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1190       v += 49;
1191     }
1192     idx = 7*i;
1193     t[idx]   = s1;t[1+idx] = s2;
1194     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1195     t[5+idx] = s6;t[6+idx] = s7;
1196   }
1197   /* backward solve the upper triangular */
1198   for (i=n-1; i>=0; i--){
1199     v    = aa + 49*diag[i] + 49;
1200     vi   = aj + diag[i] + 1;
1201     nz   = ai[i+1] - diag[i] - 1;
1202     idt  = 7*i;
1203     s1 = t[idt];  s2 = t[1+idt];
1204     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1205     s6 = t[5+idt];s7 = t[6+idt];
1206     while (nz--) {
1207       idx   = 7*(*vi++);
1208       x1    = t[idx];   x2 = t[1+idx];
1209       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1210       x6    = t[5+idx]; x7 = t[6+idx];
1211       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1212       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1213       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1214       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1215       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1216       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1217       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1218       v += 49;
1219     }
1220     idc = 7*(*c--);
1221     v   = aa + 49*diag[i];
1222     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1223                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1224     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1225                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1226     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1227                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1228     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1229                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1230     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1231                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1232     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1233                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1234     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1235                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1236   }
1237 
1238   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1239   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1240   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1241   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1242   PLogFlops(2*49*(a->nz) - 7*a->n);
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 #undef __FUNC__
1247 #define __FUNC__ "MatSolve_SeqSBAIJ_7_NaturalOrdering"
1248 int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1249 {
1250   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1251   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1252   int             ierr,*diag = a->diag,jdx;
1253   MatScalar       *aa=a->a,*v;
1254   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1255 
1256   PetscFunctionBegin;
1257   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1258   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1259   /* forward solve the lower triangular */
1260   idx    = 0;
1261   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1262   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1263   x[6] = b[6+idx];
1264   for (i=1; i<n; i++) {
1265     v     =  aa + 49*ai[i];
1266     vi    =  aj + ai[i];
1267     nz    =  diag[i] - ai[i];
1268     idx   =  7*i;
1269     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1270     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1271     s7  =  b[6+idx];
1272     while (nz--) {
1273       jdx   = 7*(*vi++);
1274       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1275       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1276       x7    = x[6+jdx];
1277       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1278       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1279       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1280       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1281       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1282       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1283       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1284       v += 49;
1285      }
1286     x[idx]   = s1;
1287     x[1+idx] = s2;
1288     x[2+idx] = s3;
1289     x[3+idx] = s4;
1290     x[4+idx] = s5;
1291     x[5+idx] = s6;
1292     x[6+idx] = s7;
1293   }
1294   /* backward solve the upper triangular */
1295   for (i=n-1; i>=0; i--){
1296     v    = aa + 49*diag[i] + 49;
1297     vi   = aj + diag[i] + 1;
1298     nz   = ai[i+1] - diag[i] - 1;
1299     idt  = 7*i;
1300     s1 = x[idt];   s2 = x[1+idt];
1301     s3 = x[2+idt]; s4 = x[3+idt];
1302     s5 = x[4+idt]; s6 = x[5+idt];
1303     s7 = x[6+idt];
1304     while (nz--) {
1305       idx   = 7*(*vi++);
1306       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1307       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1308       x7    = x[6+idx];
1309       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1316       v += 49;
1317     }
1318     v        = aa + 49*diag[i];
1319     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1320                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1321     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1322                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1323     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1324                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1325     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1326                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1327     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1328                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1329     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1330                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1331     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1332                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1333   }
1334 
1335   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1336   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1337   PLogFlops(2*36*(a->nz) - 6*a->n);
1338   PetscFunctionReturn(0);
1339 }
1340 
1341 #undef __FUNC__
1342 #define __FUNC__ "MatSolve_SeqSBAIJ_6"
1343 int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
1344 {
1345   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1346   IS              iscol=a->col,isrow=a->row;
1347   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1348   int             *diag = a->diag;
1349   MatScalar       *aa=a->a,*v;
1350   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1351 
1352   PetscFunctionBegin;
1353   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1354   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1355   t  = a->solve_work;
1356 
1357   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1358   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1359 
1360   /* forward solve the lower triangular */
1361   idx    = 6*(*r++);
1362   t[0] = b[idx];   t[1] = b[1+idx];
1363   t[2] = b[2+idx]; t[3] = b[3+idx];
1364   t[4] = b[4+idx]; t[5] = b[5+idx];
1365   for (i=1; i<n; i++) {
1366     v     = aa + 36*ai[i];
1367     vi    = aj + ai[i];
1368     nz    = diag[i] - ai[i];
1369     idx   = 6*(*r++);
1370     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1371     s5  = b[4+idx]; s6 = b[5+idx];
1372     while (nz--) {
1373       idx   = 6*(*vi++);
1374       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1375       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1376       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1377       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1378       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1379       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1380       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1381       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1382       v += 36;
1383     }
1384     idx = 6*i;
1385     t[idx]   = s1;t[1+idx] = s2;
1386     t[2+idx] = s3;t[3+idx] = s4;
1387     t[4+idx] = s5;t[5+idx] = s6;
1388   }
1389   /* backward solve the upper triangular */
1390   for (i=n-1; i>=0; i--){
1391     v    = aa + 36*diag[i] + 36;
1392     vi   = aj + diag[i] + 1;
1393     nz   = ai[i+1] - diag[i] - 1;
1394     idt  = 6*i;
1395     s1 = t[idt];  s2 = t[1+idt];
1396     s3 = t[2+idt];s4 = t[3+idt];
1397     s5 = t[4+idt];s6 = t[5+idt];
1398     while (nz--) {
1399       idx   = 6*(*vi++);
1400       x1    = t[idx];   x2 = t[1+idx];
1401       x3    = t[2+idx]; x4 = t[3+idx];
1402       x5    = t[4+idx]; x6 = t[5+idx];
1403       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1404       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1405       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1406       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1407       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1408       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1409       v += 36;
1410     }
1411     idc = 6*(*c--);
1412     v   = aa + 36*diag[i];
1413     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1414                                  v[18]*s4+v[24]*s5+v[30]*s6;
1415     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1416                                  v[19]*s4+v[25]*s5+v[31]*s6;
1417     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1418                                  v[20]*s4+v[26]*s5+v[32]*s6;
1419     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1420                                  v[21]*s4+v[27]*s5+v[33]*s6;
1421     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1422                                  v[22]*s4+v[28]*s5+v[34]*s6;
1423     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1424                                  v[23]*s4+v[29]*s5+v[35]*s6;
1425   }
1426 
1427   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1428   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1429   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1430   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1431   PLogFlops(2*36*(a->nz) - 6*a->n);
1432   PetscFunctionReturn(0);
1433 }
1434 
1435 #undef __FUNC__
1436 #define __FUNC__ "MatSolve_SeqSBAIJ_6_NaturalOrdering"
1437 int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1438 {
1439   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1440   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1441   int             ierr,*diag = a->diag,jdx;
1442   MatScalar       *aa=a->a,*v;
1443   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1444 
1445   PetscFunctionBegin;
1446   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1447   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1448   /* forward solve the lower triangular */
1449   idx    = 0;
1450   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1451   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1452   for (i=1; i<n; i++) {
1453     v     =  aa + 36*ai[i];
1454     vi    =  aj + ai[i];
1455     nz    =  diag[i] - ai[i];
1456     idx   =  6*i;
1457     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1458     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1459     while (nz--) {
1460       jdx   = 6*(*vi++);
1461       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1462       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1463       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1464       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1465       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1466       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1467       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1468       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1469       v += 36;
1470      }
1471     x[idx]   = s1;
1472     x[1+idx] = s2;
1473     x[2+idx] = s3;
1474     x[3+idx] = s4;
1475     x[4+idx] = s5;
1476     x[5+idx] = s6;
1477   }
1478   /* backward solve the upper triangular */
1479   for (i=n-1; i>=0; i--){
1480     v    = aa + 36*diag[i] + 36;
1481     vi   = aj + diag[i] + 1;
1482     nz   = ai[i+1] - diag[i] - 1;
1483     idt  = 6*i;
1484     s1 = x[idt];   s2 = x[1+idt];
1485     s3 = x[2+idt]; s4 = x[3+idt];
1486     s5 = x[4+idt]; s6 = x[5+idt];
1487     while (nz--) {
1488       idx   = 6*(*vi++);
1489       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1490       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1491       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1492       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1493       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1494       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1495       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1496       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1497       v += 36;
1498     }
1499     v        = aa + 36*diag[i];
1500     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1501     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1502     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1503     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1504     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1505     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1506   }
1507 
1508   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1509   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1510   PLogFlops(2*36*(a->nz) - 6*a->n);
1511   PetscFunctionReturn(0);
1512 }
1513 
1514 #undef __FUNC__
1515 #define __FUNC__ "MatSolve_SeqSBAIJ_5"
1516 int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
1517 {
1518   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1519   IS              iscol=a->col,isrow=a->row;
1520   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1521   int             *diag = a->diag;
1522   MatScalar       *aa=a->a,*v;
1523   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1524 
1525   PetscFunctionBegin;
1526   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1527   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1528   t  = a->solve_work;
1529 
1530   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1531   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1532 
1533   /* forward solve the lower triangular */
1534   idx    = 5*(*r++);
1535   t[0] = b[idx];   t[1] = b[1+idx];
1536   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1537   for (i=1; i<n; i++) {
1538     v     = aa + 25*ai[i];
1539     vi    = aj + ai[i];
1540     nz    = diag[i] - ai[i];
1541     idx   = 5*(*r++);
1542     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1543     s5  = b[4+idx];
1544     while (nz--) {
1545       idx   = 5*(*vi++);
1546       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1547       x4    = t[3+idx];x5 = t[4+idx];
1548       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1549       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1550       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1551       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1552       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1553       v += 25;
1554     }
1555     idx = 5*i;
1556     t[idx]   = s1;t[1+idx] = s2;
1557     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1558   }
1559   /* backward solve the upper triangular */
1560   for (i=n-1; i>=0; i--){
1561     v    = aa + 25*diag[i] + 25;
1562     vi   = aj + diag[i] + 1;
1563     nz   = ai[i+1] - diag[i] - 1;
1564     idt  = 5*i;
1565     s1 = t[idt];  s2 = t[1+idt];
1566     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1567     while (nz--) {
1568       idx   = 5*(*vi++);
1569       x1    = t[idx];   x2 = t[1+idx];
1570       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1571       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1572       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1573       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1574       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1575       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1576       v += 25;
1577     }
1578     idc = 5*(*c--);
1579     v   = aa + 25*diag[i];
1580     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1581                                  v[15]*s4+v[20]*s5;
1582     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1583                                  v[16]*s4+v[21]*s5;
1584     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1585                                  v[17]*s4+v[22]*s5;
1586     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1587                                  v[18]*s4+v[23]*s5;
1588     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1589                                  v[19]*s4+v[24]*s5;
1590   }
1591 
1592   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1593   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1594   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1595   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1596   PLogFlops(2*25*(a->nz) - 5*a->n);
1597   PetscFunctionReturn(0);
1598 }
1599 
1600 #undef __FUNC__
1601 #define __FUNC__ "MatSolve_SeqSBAIJ_5_NaturalOrdering"
1602 int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1603 {
1604   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1605   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1606   int             ierr,*diag = a->diag,jdx;
1607   MatScalar       *aa=a->a,*v;
1608   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1609 
1610   PetscFunctionBegin;
1611   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1612   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1613   /* forward solve the lower triangular */
1614   idx    = 0;
1615   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1616   for (i=1; i<n; i++) {
1617     v     =  aa + 25*ai[i];
1618     vi    =  aj + ai[i];
1619     nz    =  diag[i] - ai[i];
1620     idx   =  5*i;
1621     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1622     while (nz--) {
1623       jdx   = 5*(*vi++);
1624       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1625       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1626       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1627       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1628       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1629       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1630       v    += 25;
1631     }
1632     x[idx]   = s1;
1633     x[1+idx] = s2;
1634     x[2+idx] = s3;
1635     x[3+idx] = s4;
1636     x[4+idx] = s5;
1637   }
1638   /* backward solve the upper triangular */
1639   for (i=n-1; i>=0; i--){
1640     v    = aa + 25*diag[i] + 25;
1641     vi   = aj + diag[i] + 1;
1642     nz   = ai[i+1] - diag[i] - 1;
1643     idt  = 5*i;
1644     s1 = x[idt];  s2 = x[1+idt];
1645     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1646     while (nz--) {
1647       idx   = 5*(*vi++);
1648       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1649       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1650       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1651       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1652       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1653       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1654       v    += 25;
1655     }
1656     v        = aa + 25*diag[i];
1657     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1658     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1659     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1660     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1661     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1662   }
1663 
1664   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1665   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1666   PLogFlops(2*25*(a->nz) - 5*a->n);
1667   PetscFunctionReturn(0);
1668 }
1669 
1670 #undef __FUNC__
1671 #define __FUNC__ "MatSolve_SeqSBAIJ_4"
1672 int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
1673 {
1674   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1675   IS              iscol=a->col,isrow=a->row;
1676   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1677   int             *diag = a->diag;
1678   MatScalar       *aa=a->a,*v;
1679   Scalar          *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1680 
1681   PetscFunctionBegin;
1682   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1683   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1684   t  = a->solve_work;
1685 
1686   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1687   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1688 
1689   /* forward solve the lower triangular */
1690   idx    = 4*(*r++);
1691   t[0] = b[idx];   t[1] = b[1+idx];
1692   t[2] = b[2+idx]; t[3] = b[3+idx];
1693   for (i=1; i<n; i++) {
1694     v     = aa + 16*ai[i];
1695     vi    = aj + ai[i];
1696     nz    = diag[i] - ai[i];
1697     idx   = 4*(*r++);
1698     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1699     while (nz--) {
1700       idx   = 4*(*vi++);
1701       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1702       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1703       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1704       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1705       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1706       v    += 16;
1707     }
1708     idx        = 4*i;
1709     t[idx]   = s1;t[1+idx] = s2;
1710     t[2+idx] = s3;t[3+idx] = s4;
1711   }
1712   /* backward solve the upper triangular */
1713   for (i=n-1; i>=0; i--){
1714     v    = aa + 16*diag[i] + 16;
1715     vi   = aj + diag[i] + 1;
1716     nz   = ai[i+1] - diag[i] - 1;
1717     idt  = 4*i;
1718     s1 = t[idt];  s2 = t[1+idt];
1719     s3 = t[2+idt];s4 = t[3+idt];
1720     while (nz--) {
1721       idx   = 4*(*vi++);
1722       x1    = t[idx];   x2 = t[1+idx];
1723       x3    = t[2+idx]; x4 = t[3+idx];
1724       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1725       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1726       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1727       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1728       v += 16;
1729     }
1730     idc      = 4*(*c--);
1731     v        = aa + 16*diag[i];
1732     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1733     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1734     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1735     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1736   }
1737 
1738   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1739   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1742   PLogFlops(2*16*(a->nz) - 4*a->n);
1743   PetscFunctionReturn(0);
1744 }
1745 
1746 
1747 /*
1748       Special case where the matrix was ILU(0) factored in the natural
1749    ordering. This eliminates the need for the column and row permutation.
1750 */
1751 #undef __FUNC__
1752 #define __FUNC__ "MatSolve_SeqSBAIJ_4_NaturalOrdering"
1753 int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1754 {
1755   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1756   int             n=a->mbs,*ai=a->i,*aj=a->j;
1757   int             ierr,*diag = a->diag;
1758   MatScalar       *aa=a->a;
1759   Scalar          *x,*b;
1760 
1761   PetscFunctionBegin;
1762   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1764 
1765 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1766   {
1767     static Scalar w[2000]; /* very BAD need to fix */
1768     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1769   }
1770 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1771   {
1772     static Scalar w[2000]; /* very BAD need to fix */
1773     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1774   }
1775 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1776   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1777 #else
1778   {
1779     Scalar    s1,s2,s3,s4,x1,x2,x3,x4;
1780     MatScalar *v;
1781     int       jdx,idt,idx,nz,*vi,i,ai16;
1782 
1783   /* forward solve the lower triangular */
1784   idx    = 0;
1785   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1786   for (i=1; i<n; i++) {
1787     v     =  aa      + 16*ai[i];
1788     vi    =  aj      + ai[i];
1789     nz    =  diag[i] - ai[i];
1790     idx   +=  4;
1791     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1792     while (nz--) {
1793       jdx   = 4*(*vi++);
1794       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1795       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1796       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1797       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1798       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1799       v    += 16;
1800     }
1801     x[idx]   = s1;
1802     x[1+idx] = s2;
1803     x[2+idx] = s3;
1804     x[3+idx] = s4;
1805   }
1806   /* backward solve the upper triangular */
1807   idt = 4*(n-1);
1808   for (i=n-1; i>=0; i--){
1809     ai16 = 16*diag[i];
1810     v    = aa + ai16 + 16;
1811     vi   = aj + diag[i] + 1;
1812     nz   = ai[i+1] - diag[i] - 1;
1813     s1 = x[idt];  s2 = x[1+idt];
1814     s3 = x[2+idt];s4 = x[3+idt];
1815     while (nz--) {
1816       idx   = 4*(*vi++);
1817       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1818       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1819       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1820       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1821       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1822       v    += 16;
1823     }
1824     v        = aa + ai16;
1825     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
1826     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
1827     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1828     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1829     idt -= 4;
1830   }
1831   }
1832 #endif
1833 
1834   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1835   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1836   PLogFlops(2*16*(a->nz) - 4*a->n);
1837   PetscFunctionReturn(0);
1838 }
1839 
1840 #undef __FUNC__
1841 #define __FUNC__ "MatSolve_SeqSBAIJ_3"
1842 int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
1843 {
1844   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1845   IS              iscol=a->col,isrow=a->row;
1846   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1847   int             *diag = a->diag;
1848   MatScalar       *aa=a->a,*v;
1849   Scalar          *x,*b,s1,s2,s3,x1,x2,x3,*t;
1850 
1851   PetscFunctionBegin;
1852   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1853   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1854   t  = a->solve_work;
1855 
1856   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1857   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1858 
1859   /* forward solve the lower triangular */
1860   idx    = 3*(*r++);
1861   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1862   for (i=1; i<n; i++) {
1863     v     = aa + 9*ai[i];
1864     vi    = aj + ai[i];
1865     nz    = diag[i] - ai[i];
1866     idx   = 3*(*r++);
1867     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1868     while (nz--) {
1869       idx   = 3*(*vi++);
1870       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1871       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1872       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1873       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1874       v += 9;
1875     }
1876     idx = 3*i;
1877     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1878   }
1879   /* backward solve the upper triangular */
1880   for (i=n-1; i>=0; i--){
1881     v    = aa + 9*diag[i] + 9;
1882     vi   = aj + diag[i] + 1;
1883     nz   = ai[i+1] - diag[i] - 1;
1884     idt  = 3*i;
1885     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1886     while (nz--) {
1887       idx   = 3*(*vi++);
1888       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1889       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1890       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1891       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1892       v += 9;
1893     }
1894     idc = 3*(*c--);
1895     v   = aa + 9*diag[i];
1896     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1897     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1898     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1899   }
1900   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1901   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1902   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1903   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1904   PLogFlops(2*9*(a->nz) - 3*a->n);
1905   PetscFunctionReturn(0);
1906 }
1907 
1908 /*
1909       Special case where the matrix was ILU(0) factored in the natural
1910    ordering. This eliminates the need for the column and row permutation.
1911 */
1912 #undef __FUNC__
1913 #define __FUNC__ "MatSolve_SeqSBAIJ_3_NaturalOrdering"
1914 int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1915 {
1916   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1917   int             n=a->mbs,*ai=a->i,*aj=a->j;
1918   int             ierr,*diag = a->diag;
1919   MatScalar       *aa=a->a,*v;
1920   Scalar          *x,*b,s1,s2,s3,x1,x2,x3;
1921   int             jdx,idt,idx,nz,*vi,i;
1922 
1923   PetscFunctionBegin;
1924   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1925   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1926 
1927 
1928   /* forward solve the lower triangular */
1929   idx    = 0;
1930   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
1931   for (i=1; i<n; i++) {
1932     v     =  aa      + 9*ai[i];
1933     vi    =  aj      + ai[i];
1934     nz    =  diag[i] - ai[i];
1935     idx   +=  3;
1936     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
1937     while (nz--) {
1938       jdx   = 3*(*vi++);
1939       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1940       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1941       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1942       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1943       v    += 9;
1944     }
1945     x[idx]   = s1;
1946     x[1+idx] = s2;
1947     x[2+idx] = s3;
1948   }
1949   /* backward solve the upper triangular */
1950   for (i=n-1; i>=0; i--){
1951     v    = aa + 9*diag[i] + 9;
1952     vi   = aj + diag[i] + 1;
1953     nz   = ai[i+1] - diag[i] - 1;
1954     idt  = 3*i;
1955     s1 = x[idt];  s2 = x[1+idt];
1956     s3 = x[2+idt];
1957     while (nz--) {
1958       idx   = 3*(*vi++);
1959       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
1960       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1961       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1962       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1963       v    += 9;
1964     }
1965     v        = aa +  9*diag[i];
1966     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1967     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1968     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1969   }
1970 
1971   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1972   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1973   PLogFlops(2*9*(a->nz) - 3*a->n);
1974   PetscFunctionReturn(0);
1975 }
1976 
1977 #undef __FUNC__
1978 #define __FUNC__ "MatSolve_SeqSBAIJ_2"
1979 int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1980 {
1981   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1982   IS              iscol=a->col,isrow=a->row;
1983   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1984   int             *diag = a->diag;
1985   MatScalar       *aa=a->a,*v;
1986   Scalar          *x,*b,s1,s2,x1,x2,*t;
1987 
1988   PetscFunctionBegin;
1989   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1990   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1991   t  = a->solve_work;
1992 
1993   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1994   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1995 
1996   /* forward solve the lower triangular */
1997   idx    = 2*(*r++);
1998   t[0] = b[idx]; t[1] = b[1+idx];
1999   for (i=1; i<n; i++) {
2000     v     = aa + 4*ai[i];
2001     vi    = aj + ai[i];
2002     nz    = diag[i] - ai[i];
2003     idx   = 2*(*r++);
2004     s1  = b[idx]; s2 = b[1+idx];
2005     while (nz--) {
2006       idx   = 2*(*vi++);
2007       x1    = t[idx]; x2 = t[1+idx];
2008       s1 -= v[0]*x1 + v[2]*x2;
2009       s2 -= v[1]*x1 + v[3]*x2;
2010       v += 4;
2011     }
2012     idx = 2*i;
2013     t[idx] = s1; t[1+idx] = s2;
2014   }
2015   /* backward solve the upper triangular */
2016   for (i=n-1; i>=0; i--){
2017     v    = aa + 4*diag[i] + 4;
2018     vi   = aj + diag[i] + 1;
2019     nz   = ai[i+1] - diag[i] - 1;
2020     idt  = 2*i;
2021     s1 = t[idt]; s2 = t[1+idt];
2022     while (nz--) {
2023       idx   = 2*(*vi++);
2024       x1    = t[idx]; x2 = t[1+idx];
2025       s1 -= v[0]*x1 + v[2]*x2;
2026       s2 -= v[1]*x1 + v[3]*x2;
2027       v += 4;
2028     }
2029     idc = 2*(*c--);
2030     v   = aa + 4*diag[i];
2031     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2032     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2033   }
2034   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2035   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2036   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2037   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2038   PLogFlops(2*4*(a->nz) - 2*a->n);
2039   PetscFunctionReturn(0);
2040 }
2041 
2042 /*
2043       Special case where the matrix was ILU(0) factored in the natural
2044    ordering. This eliminates the need for the column and row permutation.
2045 */
2046 #undef __FUNC__
2047 #define __FUNC__ "MatSolve_SeqSBAIJ_2_NaturalOrdering"
2048 int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2049 {
2050   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2051   int             n=a->mbs,*ai=a->i,*aj=a->j;
2052   int             ierr,*diag = a->diag;
2053   MatScalar       *aa=a->a,*v;
2054   Scalar          *x,*b,s1,s2,x1,x2;
2055   int             jdx,idt,idx,nz,*vi,i;
2056 
2057   PetscFunctionBegin;
2058   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2059   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2060 
2061   /* forward solve the lower triangular */
2062   idx    = 0;
2063   x[0]   = b[0]; x[1] = b[1];
2064   for (i=1; i<n; i++) {
2065     v     =  aa      + 4*ai[i];
2066     vi    =  aj      + ai[i];
2067     nz    =  diag[i] - ai[i];
2068     idx   +=  2;
2069     s1  =  b[idx];s2 = b[1+idx];
2070     while (nz--) {
2071       jdx   = 2*(*vi++);
2072       x1    = x[jdx];x2 = x[1+jdx];
2073       s1 -= v[0]*x1 + v[2]*x2;
2074       s2 -= v[1]*x1 + v[3]*x2;
2075       v    += 4;
2076     }
2077     x[idx]   = s1;
2078     x[1+idx] = s2;
2079   }
2080   /* backward solve the upper triangular */
2081   for (i=n-1; i>=0; i--){
2082     v    = aa + 4*diag[i] + 4;
2083     vi   = aj + diag[i] + 1;
2084     nz   = ai[i+1] - diag[i] - 1;
2085     idt  = 2*i;
2086     s1 = x[idt];  s2 = x[1+idt];
2087     while (nz--) {
2088       idx   = 2*(*vi++);
2089       x1    = x[idx];   x2 = x[1+idx];
2090       s1 -= v[0]*x1 + v[2]*x2;
2091       s2 -= v[1]*x1 + v[3]*x2;
2092       v    += 4;
2093     }
2094     v        = aa +  4*diag[i];
2095     x[idt]   = v[0]*s1 + v[2]*s2;
2096     x[1+idt] = v[1]*s1 + v[3]*s2;
2097   }
2098 
2099   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2100   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2101   PLogFlops(2*4*(a->nz) - 2*a->n);
2102   PetscFunctionReturn(0);
2103 }
2104 
2105 /*----------- New 1 --------------*/
2106 #undef __FUNC__
2107 #define __FUNC__ "MatSolve_SeqSBAIJ_1"
2108 int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
2109 {
2110   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ *)A->data;
2111   IS              isrow=a->row;
2112   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip;
2113   MatScalar       *aa=a->a,*v;
2114   Scalar          *x,*b,xk,*xtmp;
2115   int             nz,*vj,k;
2116 
2117   PetscFunctionBegin;
2118   if (!mbs) PetscFunctionReturn(0);
2119 
2120   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2121   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2122   xtmp = a->solve_work;
2123 
2124   ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr);
2125 
2126   /* solve U'*D*y = b by forward substitution */
2127   for (k=0; k<mbs; k++) xtmp[k] = b[rip[k]];
2128   for (k=0; k<mbs; k++){
2129     v  = aa + ai[k];
2130     vj = aj + ai[k];
2131     xk = xtmp[k];
2132     nz = ai[k+1] - ai[k];
2133     while (nz--) xtmp[*vj++] += (*v++) * xk;
2134     xtmp[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
2135   }
2136 
2137   /* solve U*x = y by back substitution */
2138 
2139   for (k=mbs-1; k>=0; k--){
2140     v  = aa + ai[k];
2141     vj = aj + ai[k];
2142     xk = xtmp[k];
2143     nz = ai[k+1] - ai[k];
2144     while (nz--) xk += (*v++) * xtmp[*vj++];
2145     xtmp[k] = xk;
2146     x[rip[k]] = xk;
2147   }
2148 
2149   ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr);
2150   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2151   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2152   PLogFlops(4*a->s_nz + a->m);
2153   PetscFunctionReturn(0);
2154 }
2155 
2156 
2157 #ifdef CONT
2158   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2159   IS              iscol=a->col,isrow=a->row;
2160   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2161   int             *diag = a->diag;
2162   MatScalar       *aa=a->a,*v;
2163   Scalar          *x,*b,s1,*t;
2164 
2165   PetscFunctionBegin;
2166   if (!n) PetscFunctionReturn(0);
2167 
2168   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2169   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2170   t  = a->solve_work;
2171 
2172   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2173   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2174 
2175   /* forward solve the lower triangular */
2176   t[0] = b[*r++];
2177   for (i=1; i<n; i++) {
2178     v     = aa + ai[i];
2179     vi    = aj + ai[i];
2180     nz    = diag[i] - ai[i];
2181     s1  = b[*r++];
2182     while (nz--) {
2183       s1 -= (*v++)*t[*vi++];
2184     }
2185     t[i] = s1;
2186   }
2187   /* backward solve the upper triangular */
2188   for (i=n-1; i>=0; i--){
2189     v    = aa + diag[i] + 1;
2190     vi   = aj + diag[i] + 1;
2191     nz   = ai[i+1] - diag[i] - 1;
2192     s1 = t[i];
2193     while (nz--) {
2194       s1 -= (*v++)*t[*vi++];
2195     }
2196     x[*c--] = t[i] = aa[diag[i]]*s1;
2197   }
2198 
2199   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2200   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2201   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2202   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2203   PLogFlops(2*1*(a->nz) - a->n);
2204   PetscFunctionReturn(0);
2205 }
2206 
2207 #endif
2208 /*----------- New 1 --------------*/
2209 /*
2210       Special case where the matrix was ILU(0) factored in the natural
2211    ordering. This eliminates the need for the column and row permutation.
2212 */
2213 #undef __FUNC__
2214 #define __FUNC__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
2215 int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2216 {
2217   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ *)A->data;
2218   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr;
2219   MatScalar       *aa=a->a,*v;
2220   Scalar          *x,*b,xk;
2221   int             nz,*vj,k;
2222 
2223   PetscFunctionBegin;
2224   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2225   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2226 
2227   /* solve U'*D*y = b by forward substitution */
2228   for (k=0; k<mbs; k++) x[k] = b[k];
2229   for (k=0; k<mbs; k++){
2230     v  = aa + ai[k];
2231     vj = aj + ai[k];
2232     xk = x[k];
2233     nz = ai[k+1] - ai[k];
2234     while (nz--) x[*vj++] += (*v++) * xk;
2235     x[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
2236   }
2237 
2238   /* solve U*x = y by back substitution */
2239   for (k=mbs-2; k>=0; k--){
2240     v  = aa + ai[k];
2241     vj = aj + ai[k];
2242     xk = x[k];
2243     nz = ai[k+1] - ai[k];
2244     while (nz--) xk += (*v++) * x[*vj++];
2245     x[k] = xk;
2246   }
2247 
2248   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2249   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2250   PLogFlops(4*a->s_nz + a->m);
2251   PetscFunctionReturn(0);
2252 }
2253 
2254 /* ----------------------------------------------------------------*/
2255 #undef __FUNC__
2256 #define __FUNC__ "MatILUFactorSymbolic_SeqSBAIJ"
2257 int MatILUFactorSymbolic_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *B)
2258 {
2259   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2260   IS          isicol;
2261   int         *rip,*riip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
2262   int         *jutmp,bs = a->bs,bs2=a->bs2;
2263   int         m,nzi,realloc = 0,*levtmp;
2264   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2265   int         levels,incrlev,*lev,lev_ik,shift;
2266   PetscReal  f;
2267 
2268   PetscFunctionBegin;
2269   if (info) {
2270     f             = info->fill;
2271     levels        = (int)info->levels;
2272   } else {
2273     f             = 1.0;
2274     levels        = 0;
2275   }
2276   PetscValidHeaderSpecific(isrow,IS_COOKIE);
2277   PetscValidHeaderSpecific(iscol,IS_COOKIE);
2278   /* if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,0,"matrix must be square");*/
2279   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2280   ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr);
2281   ierr = ISGetIndices(isicol,&riip);CHKERRQ(ierr);
2282   for (k=0; k<mbs; k++) {
2283     if ( rip[k] - riip[k] != 0 ) {
2284       printf("Non-symm. permutation, use symm. permutation or general matrix format\n");
2285       break;
2286     }
2287   }
2288 
2289   /* initialization */
2290   /* Don't know how many column pointers are needed so estimate.
2291      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
2292 
2293   umax = (int)(f*ai[mbs] + 1);
2294   lev = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(lev);
2295   umax += mbs + 1;
2296   shift = mbs + 1;
2297   ju = iu = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju);
2298   iu[0] = mbs+1;
2299   juptr = mbs;
2300   jl =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2301   q  =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q);
2302   levtmp =  (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(levtmp);
2303   for (i=0; i<mbs; i++){
2304     jl[i] = mbs; q[i] = 0;
2305   }
2306 
2307   /* for each row k */
2308   for (k=0; k<mbs; k++){
2309     nzk = 0;
2310     q[k] = mbs;
2311     /* initialize nonzero structure of k-th row to row rip[k] of A */
2312     jmin = ai[rip[k]];
2313     jmax = ai[rip[k]+1];
2314     for (j=jmin; j<jmax; j++){
2315       vj = riip[aj[j]];
2316       if(vj > k){
2317         qm = k;
2318         do {
2319           m  = qm; qm = q[m];
2320         } while(qm < vj);
2321         if (qm == vj) {
2322           printf(" error: duplicate entry in A\n"); break;
2323         }
2324         nzk++;
2325         q[m]   = vj;
2326         q[vj]  = qm;
2327         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2328       } /* if(vj > k) */
2329     } /* for (j=jmin; j<jmax; j++) */
2330 
2331     /* modify nonzero structure of k-th row by computing fill-in
2332        for each row i to be merged in */
2333     i = k;
2334     i = jl[i]; /* next pivot row (== 0 for symbolic factorization) */
2335     /* printf(" next pivot row i=%d\n",i); */
2336     while (i < mbs){
2337       /* merge row i into k-th row */
2338       j=iu[i];
2339       vj = ju[j];
2340       lev_ik = lev[j-shift];
2341       nzi = iu[i+1] - (iu[i]+1);
2342       jmin = iu[i] + 1; jmax = iu[i] + nzi;
2343       qm = k;
2344       for (j=jmin; j<jmax+1; j++){
2345         vj = ju[j];
2346         incrlev = lev[j-shift]+lev_ik+1;
2347 
2348         if (incrlev > levels) continue;
2349         do {
2350           m = qm; qm = q[m];
2351         } while (qm < vj);
2352         if (qm != vj){      /* a fill */
2353           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2354           levtmp[vj] = incrlev;
2355         }
2356         else {              /* already a nonzero element */
2357           if (levtmp[vj]>incrlev) levtmp[vj] = incrlev;
2358         }
2359       }
2360       i = jl[i]; /* next pivot row */
2361     }
2362 
2363     /* add k to row list for first nonzero element in k-th row */
2364     if (nzk > 1){
2365       i = q[k]; /* col value of first nonzero element in k_th row of U */
2366       jl[k] = jl[i]; jl[i] = k;
2367     }
2368     iu[k+1] = iu[k] + nzk;   /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/
2369 
2370     /* allocate more space to ju and lev if needed */
2371     if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax);
2372       /* estimate how much additional space we will need */
2373       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2374       /* just double the memory each time */
2375       maxadd = umax;
2376       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2377       umax += maxadd;
2378 
2379       /* allocate a longer ju (NOTE: iu poits to the beginning of ju) */
2380       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
2381       ierr  = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
2382       ierr = PetscFree(ju);CHKERRQ(ierr);
2383       ju = iu = jutmp;
2384 
2385       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
2386       ierr  = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));CHKERRQ(ierr);
2387       ierr = PetscFree(lev);CHKERRQ(ierr);
2388       lev = jutmp;
2389 
2390       realloc += 2; /* count how many times we realloc */
2391     }
2392 
2393     /* save nonzero structure of k-th row in ju */
2394     i=k;
2395     jumin = juptr + 1; juptr += nzk;
2396     for (j=jumin; j<juptr+1; j++){
2397       i      = q[i];
2398       ju[j]  = i;
2399       lev[j-shift] = levtmp[i];
2400       /* printf(" k=%d, ju[%d]=%d\n",k,j,ju[j]);*/
2401     }
2402     /* printf("\n");  */
2403   } /* for (k=0; k<mbs; k++) */
2404 
2405   if (ai[mbs] != 0) {
2406     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2407     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
2408     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af);
2409     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af);
2410     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:for best performance.\n");
2411   } else {
2412      PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
2413   }
2414 
2415   ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr);
2416   ierr = ISRestoreIndices(isicol,&riip);CHKERRQ(ierr);
2417 
2418   ierr = PetscFree(q);CHKERRQ(ierr);
2419   ierr = PetscFree(jl);CHKERRQ(ierr);
2420 
2421   /* put together the new matrix */
2422   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
2423   PLogObjectParent(*B,isicol);
2424   b = (Mat_SeqSBAIJ*)(*B)->data;
2425   ierr = PetscFree(b->imax);CHKERRQ(ierr);
2426   b->singlemalloc = PETSC_FALSE;
2427   /* the next line frees the default space generated by the Create() */
2428   ierr = PetscFree(b->a);CHKERRQ(ierr);
2429   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
2430   b->a          = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a);
2431   b->j          = ju;
2432   b->i          = iu;
2433   b->diag       = 0;
2434   b->ilen       = 0;
2435   b->imax       = 0;
2436   b->row        = isrow;
2437   b->col        = iscol;
2438   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2439   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2440   b->icol       = isicol;
2441   b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
2442   /* In b structure:  Free imax, ilen, old a, old j.
2443      Allocate idnew, solve_work, new a, new j */
2444   PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
2445   b->s_maxnz = b->s_nz = iu[mbs];
2446 
2447   (*B)->factor                 = FACTOR_LU;
2448   (*B)->info.factor_mallocs    = realloc;
2449   (*B)->info.fill_ratio_given  = f;
2450   if (ai[mbs] != 0) {
2451     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2452   } else {
2453     (*B)->info.fill_ratio_needed = 0.0;
2454   }
2455 
2456   PetscFunctionReturn(0);
2457 }
2458 
2459 
2460 
2461