xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision f7cf7585ec4ab5bbe01f2b7df7e992ca325859cc)
1 /*$Id: sbaijfact2.c,v 1.2 2000/06/25 17:07:00 balay Exp hzhang $*/
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include "sbaij.h"
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/ilu.h"
10 #include "src/inline/dot.h"
11 
12 #undef __FUNC__
13 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering"
14 int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
17   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   int             *diag = a->diag;
19   MatScalar       *aa=a->a,*v;
20   Scalar          s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25 
26   /* forward solve the U^T */
27   for (i=0; i<n; i++) {
28 
29     v     = aa + diag[i];
30     /* multiply by the inverse of the block diagonal */
31     s1    = (*v++)*b[i];
32     vi    = aj + diag[i] + 1;
33     nz    = ai[i+1] - diag[i] - 1;
34     while (nz--) {
35       x[*vi++]  -= (*v++)*s1;
36     }
37     x[i]   = s1;
38   }
39   /* backward solve the L^T */
40   for (i=n-1; i>=0; i--){
41     v    = aa + diag[i] - 1;
42     vi   = aj + diag[i] - 1;
43     nz   = diag[i] - ai[i];
44     s1   = x[i];
45     while (nz--) {
46       x[*vi--]   -=  (*v--)*s1;
47     }
48   }
49   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51   PLogFlops(2*(a->nz) - a->n);
52   PetscFunctionReturn(0);
53 }
54 
55 #undef __FUNC__
56 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering"
57 int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58 {
59   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
60   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61   int             *diag = a->diag,oidx;
62   MatScalar       *aa=a->a,*v;
63   Scalar          s1,s2,x1,x2;
64   Scalar          *x,*b;
65 
66   PetscFunctionBegin;
67   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
68   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
69 
70   /* forward solve the U^T */
71   idx = 0;
72   for (i=0; i<n; i++) {
73 
74     v     = aa + 4*diag[i];
75     /* multiply by the inverse of the block diagonal */
76     x1 = b[idx];   x2 = b[1+idx];
77     s1 = v[0]*x1  +  v[1]*x2;
78     s2 = v[2]*x1  +  v[3]*x2;
79     v += 4;
80 
81     vi    = aj + diag[i] + 1;
82     nz    = ai[i+1] - diag[i] - 1;
83     while (nz--) {
84       oidx = 2*(*vi++);
85       x[oidx]   -= v[0]*s1  +  v[1]*s2;
86       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
87       v  += 4;
88     }
89     x[idx]   = s1;x[1+idx] = s2;
90     idx += 2;
91   }
92   /* backward solve the L^T */
93   for (i=n-1; i>=0; i--){
94     v    = aa + 4*diag[i] - 4;
95     vi   = aj + diag[i] - 1;
96     nz   = diag[i] - ai[i];
97     idt  = 2*i;
98     s1   = x[idt];  s2 = x[1+idt];
99     while (nz--) {
100       idx   = 2*(*vi--);
101       x[idx]   -=  v[0]*s1 +  v[1]*s2;
102       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
103       v -= 4;
104     }
105   }
106   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
107   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
108   PLogFlops(2*4*(a->nz) - 2*a->n);
109   PetscFunctionReturn(0);
110 }
111 
112 #undef __FUNC__
113 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering"
114 int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
115 {
116   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
117   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
118   int             *diag = a->diag,oidx;
119   MatScalar       *aa=a->a,*v;
120   Scalar          s1,s2,s3,x1,x2,x3;
121   Scalar          *x,*b;
122 
123   PetscFunctionBegin;
124   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
125   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126 
127   /* forward solve the U^T */
128   idx = 0;
129   for (i=0; i<n; i++) {
130 
131     v     = aa + 9*diag[i];
132     /* multiply by the inverse of the block diagonal */
133     x1 = b[idx];   x2 = b[1+idx]; x3    = b[2+idx];
134     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
135     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
136     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
137     v += 9;
138 
139     vi    = aj + diag[i] + 1;
140     nz    = ai[i+1] - diag[i] - 1;
141     while (nz--) {
142       oidx = 3*(*vi++);
143       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
144       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
145       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
146       v  += 9;
147     }
148     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
149     idx += 3;
150   }
151   /* backward solve the L^T */
152   for (i=n-1; i>=0; i--){
153     v    = aa + 9*diag[i] - 9;
154     vi   = aj + diag[i] - 1;
155     nz   = diag[i] - ai[i];
156     idt  = 3*i;
157     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
158     while (nz--) {
159       idx   = 3*(*vi--);
160       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
161       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
162       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
163       v -= 9;
164     }
165   }
166   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
168   PLogFlops(2*9*(a->nz) - 3*a->n);
169   PetscFunctionReturn(0);
170 }
171 
172 #undef __FUNC__
173 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering"
174 int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
175 {
176   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
177   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
178   int             *diag = a->diag,oidx;
179   MatScalar       *aa=a->a,*v;
180   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
181   Scalar          *x,*b;
182 
183   PetscFunctionBegin;
184   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
185   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
186 
187   /* forward solve the U^T */
188   idx = 0;
189   for (i=0; i<n; i++) {
190 
191     v     = aa + 16*diag[i];
192     /* multiply by the inverse of the block diagonal */
193     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx];
194     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
195     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
196     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
197     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
198     v += 16;
199 
200     vi    = aj + diag[i] + 1;
201     nz    = ai[i+1] - diag[i] - 1;
202     while (nz--) {
203       oidx = 4*(*vi++);
204       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
205       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
206       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
207       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
208       v  += 16;
209     }
210     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
211     idx += 4;
212   }
213   /* backward solve the L^T */
214   for (i=n-1; i>=0; i--){
215     v    = aa + 16*diag[i] - 16;
216     vi   = aj + diag[i] - 1;
217     nz   = diag[i] - ai[i];
218     idt  = 4*i;
219     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
220     while (nz--) {
221       idx   = 4*(*vi--);
222       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
223       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
224       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
225       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
226       v -= 16;
227     }
228   }
229   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
230   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
231   PLogFlops(2*16*(a->nz) - 4*a->n);
232   PetscFunctionReturn(0);
233 }
234 
235 #undef __FUNC__
236 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering"
237 int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
238 {
239   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
240   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
241   int             *diag = a->diag,oidx;
242   MatScalar       *aa=a->a,*v;
243   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
244   Scalar          *x,*b;
245 
246   PetscFunctionBegin;
247   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
248   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
249 
250   /* forward solve the U^T */
251   idx = 0;
252   for (i=0; i<n; i++) {
253 
254     v     = aa + 25*diag[i];
255     /* multiply by the inverse of the block diagonal */
256     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
257     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
258     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
259     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
260     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
261     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
262     v += 25;
263 
264     vi    = aj + diag[i] + 1;
265     nz    = ai[i+1] - diag[i] - 1;
266     while (nz--) {
267       oidx = 5*(*vi++);
268       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
269       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
270       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
271       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
272       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
273       v  += 25;
274     }
275     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
276     idx += 5;
277   }
278   /* backward solve the L^T */
279   for (i=n-1; i>=0; i--){
280     v    = aa + 25*diag[i] - 25;
281     vi   = aj + diag[i] - 1;
282     nz   = diag[i] - ai[i];
283     idt  = 5*i;
284     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
285     while (nz--) {
286       idx   = 5*(*vi--);
287       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
288       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
289       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
290       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
291       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
292       v -= 25;
293     }
294   }
295   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
296   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
297   PLogFlops(2*25*(a->nz) - 5*a->n);
298   PetscFunctionReturn(0);
299 }
300 
301 #undef __FUNC__
302 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering"
303 int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
304 {
305   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
306   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307   int             *diag = a->diag,oidx;
308   MatScalar       *aa=a->a,*v;
309   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
310   Scalar          *x,*b;
311 
312   PetscFunctionBegin;
313   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
314   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
315 
316   /* forward solve the U^T */
317   idx = 0;
318   for (i=0; i<n; i++) {
319 
320     v     = aa + 36*diag[i];
321     /* multiply by the inverse of the block diagonal */
322     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
323     x6    = b[5+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
325     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
326     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
327     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
328     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
329     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
330     v += 36;
331 
332     vi    = aj + diag[i] + 1;
333     nz    = ai[i+1] - diag[i] - 1;
334     while (nz--) {
335       oidx = 6*(*vi++);
336       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
337       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
338       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
339       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
340       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
341       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
342       v  += 36;
343     }
344     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
345     x[5+idx] = s6;
346     idx += 6;
347   }
348   /* backward solve the L^T */
349   for (i=n-1; i>=0; i--){
350     v    = aa + 36*diag[i] - 36;
351     vi   = aj + diag[i] - 1;
352     nz   = diag[i] - ai[i];
353     idt  = 6*i;
354     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
355     s6 = x[5+idt];
356     while (nz--) {
357       idx   = 6*(*vi--);
358       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
359       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
360       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
361       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
362       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
363       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
364       v -= 36;
365     }
366   }
367   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
369   PLogFlops(2*36*(a->nz) - 6*a->n);
370   PetscFunctionReturn(0);
371 }
372 
373 #undef __FUNC__
374 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering"
375 int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
376 {
377   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
378   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
379   int             *diag = a->diag,oidx;
380   MatScalar       *aa=a->a,*v;
381   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
382   Scalar          *x,*b;
383 
384   PetscFunctionBegin;
385   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
386   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
387 
388   /* forward solve the U^T */
389   idx = 0;
390   for (i=0; i<n; i++) {
391 
392     v     = aa + 49*diag[i];
393     /* multiply by the inverse of the block diagonal */
394     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
395     x6    = b[5+idx]; x7 = b[6+idx];
396     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
397     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
398     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
399     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
400     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
401     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
402     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
403     v += 49;
404 
405     vi    = aj + diag[i] + 1;
406     nz    = ai[i+1] - diag[i] - 1;
407     while (nz--) {
408       oidx = 7*(*vi++);
409       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
410       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
411       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
412       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
413       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
414       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
415       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
416       v  += 49;
417     }
418     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
419     x[5+idx] = s6;x[6+idx] = s7;
420     idx += 7;
421   }
422   /* backward solve the L^T */
423   for (i=n-1; i>=0; i--){
424     v    = aa + 49*diag[i] - 49;
425     vi   = aj + diag[i] - 1;
426     nz   = diag[i] - ai[i];
427     idt  = 7*i;
428     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
429     s6 = x[5+idt];s7 = x[6+idt];
430     while (nz--) {
431       idx   = 7*(*vi--);
432       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
433       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
434       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
435       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
436       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
437       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
438       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
439       v -= 49;
440     }
441   }
442   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
444   PLogFlops(2*49*(a->nz) - 7*a->n);
445   PetscFunctionReturn(0);
446 }
447 
448 #undef __FUNC__
449 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_1"
450 int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
451 {
452   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
453   IS              iscol=a->col,isrow=a->row;
454   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
455   int             *diag = a->diag;
456   MatScalar       *aa=a->a,*v;
457   Scalar          s1,*x,*b,*t;
458 
459   PetscFunctionBegin;
460   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
461   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
462   t  = a->solve_work;
463 
464   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
465   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
466 
467   /* copy the b into temp work space according to permutation */
468   for (i=0; i<n; i++) {
469     t[i] = b[c[i]];
470   }
471 
472   /* forward solve the U^T */
473   for (i=0; i<n; i++) {
474 
475     v     = aa + diag[i];
476     /* multiply by the inverse of the block diagonal */
477     s1    = (*v++)*t[i];
478     vi    = aj + diag[i] + 1;
479     nz    = ai[i+1] - diag[i] - 1;
480     while (nz--) {
481       t[*vi++]  -= (*v++)*s1;
482     }
483     t[i]   = s1;
484   }
485   /* backward solve the L^T */
486   for (i=n-1; i>=0; i--){
487     v    = aa + diag[i] - 1;
488     vi   = aj + diag[i] - 1;
489     nz   = diag[i] - ai[i];
490     s1   = t[i];
491     while (nz--) {
492       t[*vi--]   -=  (*v--)*s1;
493     }
494   }
495 
496   /* copy t into x according to permutation */
497   for (i=0; i<n; i++) {
498     x[r[i]]   = t[i];
499   }
500 
501   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
502   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
503   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
504   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
505   PLogFlops(2*(a->nz) - a->n);
506   PetscFunctionReturn(0);
507 }
508 
509 #undef __FUNC__
510 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_2"
511 int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
512 {
513   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
514   IS              iscol=a->col,isrow=a->row;
515   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
516   int             *diag = a->diag,ii,ic,ir,oidx;
517   MatScalar       *aa=a->a,*v;
518   Scalar          s1,s2,x1,x2;
519   Scalar          *x,*b,*t;
520 
521   PetscFunctionBegin;
522   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
523   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524   t  = a->solve_work;
525 
526   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
527   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
528 
529   /* copy the b into temp work space according to permutation */
530   ii = 0;
531   for (i=0; i<n; i++) {
532     ic      = 2*c[i];
533     t[ii]   = b[ic];
534     t[ii+1] = b[ic+1];
535     ii += 2;
536   }
537 
538   /* forward solve the U^T */
539   idx = 0;
540   for (i=0; i<n; i++) {
541 
542     v     = aa + 4*diag[i];
543     /* multiply by the inverse of the block diagonal */
544     x1    = t[idx];   x2 = t[1+idx];
545     s1 = v[0]*x1  +  v[1]*x2;
546     s2 = v[2]*x1  +  v[3]*x2;
547     v += 4;
548 
549     vi    = aj + diag[i] + 1;
550     nz    = ai[i+1] - diag[i] - 1;
551     while (nz--) {
552       oidx = 2*(*vi++);
553       t[oidx]   -= v[0]*s1  +  v[1]*s2;
554       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
555       v  += 4;
556     }
557     t[idx]   = s1;t[1+idx] = s2;
558     idx += 2;
559   }
560   /* backward solve the L^T */
561   for (i=n-1; i>=0; i--){
562     v    = aa + 4*diag[i] - 4;
563     vi   = aj + diag[i] - 1;
564     nz   = diag[i] - ai[i];
565     idt  = 2*i;
566     s1 = t[idt];  s2 = t[1+idt];
567     while (nz--) {
568       idx   = 2*(*vi--);
569       t[idx]   -=  v[0]*s1 +  v[1]*s2;
570       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
571       v -= 4;
572     }
573   }
574 
575   /* copy t into x according to permutation */
576   ii = 0;
577   for (i=0; i<n; i++) {
578     ir      = 2*r[i];
579     x[ir]   = t[ii];
580     x[ir+1] = t[ii+1];
581     ii += 2;
582   }
583 
584   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
585   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
586   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
587   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
588   PLogFlops(2*4*(a->nz) - 2*a->n);
589   PetscFunctionReturn(0);
590 }
591 
592 #undef __FUNC__
593 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_3"
594 int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
595 {
596   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
597   IS              iscol=a->col,isrow=a->row;
598   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
599   int             *diag = a->diag,ii,ic,ir,oidx;
600   MatScalar       *aa=a->a,*v;
601   Scalar          s1,s2,s3,x1,x2,x3;
602   Scalar          *x,*b,*t;
603 
604   PetscFunctionBegin;
605   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
606   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
607   t  = a->solve_work;
608 
609   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
610   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
611 
612   /* copy the b into temp work space according to permutation */
613   ii = 0;
614   for (i=0; i<n; i++) {
615     ic      = 3*c[i];
616     t[ii]   = b[ic];
617     t[ii+1] = b[ic+1];
618     t[ii+2] = b[ic+2];
619     ii += 3;
620   }
621 
622   /* forward solve the U^T */
623   idx = 0;
624   for (i=0; i<n; i++) {
625 
626     v     = aa + 9*diag[i];
627     /* multiply by the inverse of the block diagonal */
628     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
629     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
630     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
631     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
632     v += 9;
633 
634     vi    = aj + diag[i] + 1;
635     nz    = ai[i+1] - diag[i] - 1;
636     while (nz--) {
637       oidx = 3*(*vi++);
638       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
639       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
640       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
641       v  += 9;
642     }
643     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
644     idx += 3;
645   }
646   /* backward solve the L^T */
647   for (i=n-1; i>=0; i--){
648     v    = aa + 9*diag[i] - 9;
649     vi   = aj + diag[i] - 1;
650     nz   = diag[i] - ai[i];
651     idt  = 3*i;
652     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
653     while (nz--) {
654       idx   = 3*(*vi--);
655       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
656       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
657       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658       v -= 9;
659     }
660   }
661 
662   /* copy t into x according to permutation */
663   ii = 0;
664   for (i=0; i<n; i++) {
665     ir      = 3*r[i];
666     x[ir]   = t[ii];
667     x[ir+1] = t[ii+1];
668     x[ir+2] = t[ii+2];
669     ii += 3;
670   }
671 
672   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
673   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
674   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
675   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
676   PLogFlops(2*9*(a->nz) - 3*a->n);
677   PetscFunctionReturn(0);
678 }
679 
680 #undef __FUNC__
681 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_4"
682 int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
683 {
684   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
685   IS              iscol=a->col,isrow=a->row;
686   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
687   int             *diag = a->diag,ii,ic,ir,oidx;
688   MatScalar       *aa=a->a,*v;
689   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
690   Scalar          *x,*b,*t;
691 
692   PetscFunctionBegin;
693   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
694   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
695   t  = a->solve_work;
696 
697   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
698   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
699 
700   /* copy the b into temp work space according to permutation */
701   ii = 0;
702   for (i=0; i<n; i++) {
703     ic      = 4*c[i];
704     t[ii]   = b[ic];
705     t[ii+1] = b[ic+1];
706     t[ii+2] = b[ic+2];
707     t[ii+3] = b[ic+3];
708     ii += 4;
709   }
710 
711   /* forward solve the U^T */
712   idx = 0;
713   for (i=0; i<n; i++) {
714 
715     v     = aa + 16*diag[i];
716     /* multiply by the inverse of the block diagonal */
717     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
718     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
719     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
720     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
721     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
722     v += 16;
723 
724     vi    = aj + diag[i] + 1;
725     nz    = ai[i+1] - diag[i] - 1;
726     while (nz--) {
727       oidx = 4*(*vi++);
728       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
729       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
730       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
731       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
732       v  += 16;
733     }
734     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
735     idx += 4;
736   }
737   /* backward solve the L^T */
738   for (i=n-1; i>=0; i--){
739     v    = aa + 16*diag[i] - 16;
740     vi   = aj + diag[i] - 1;
741     nz   = diag[i] - ai[i];
742     idt  = 4*i;
743     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
744     while (nz--) {
745       idx   = 4*(*vi--);
746       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750       v -= 16;
751     }
752   }
753 
754   /* copy t into x according to permutation */
755   ii = 0;
756   for (i=0; i<n; i++) {
757     ir      = 4*r[i];
758     x[ir]   = t[ii];
759     x[ir+1] = t[ii+1];
760     x[ir+2] = t[ii+2];
761     x[ir+3] = t[ii+3];
762     ii += 4;
763   }
764 
765   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
766   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
767   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   PLogFlops(2*16*(a->nz) - 4*a->n);
770   PetscFunctionReturn(0);
771 }
772 
773 #undef __FUNC__
774 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_5"
775 int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
778   IS              iscol=a->col,isrow=a->row;
779   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
780   int             *diag = a->diag,ii,ic,ir,oidx;
781   MatScalar       *aa=a->a,*v;
782   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
783   Scalar          *x,*b,*t;
784 
785   PetscFunctionBegin;
786   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
788   t  = a->solve_work;
789 
790   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
791   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
792 
793   /* copy the b into temp work space according to permutation */
794   ii = 0;
795   for (i=0; i<n; i++) {
796     ic      = 5*c[i];
797     t[ii]   = b[ic];
798     t[ii+1] = b[ic+1];
799     t[ii+2] = b[ic+2];
800     t[ii+3] = b[ic+3];
801     t[ii+4] = b[ic+4];
802     ii += 5;
803   }
804 
805   /* forward solve the U^T */
806   idx = 0;
807   for (i=0; i<n; i++) {
808 
809     v     = aa + 25*diag[i];
810     /* multiply by the inverse of the block diagonal */
811     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
812     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
813     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
814     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
815     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
816     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
817     v += 25;
818 
819     vi    = aj + diag[i] + 1;
820     nz    = ai[i+1] - diag[i] - 1;
821     while (nz--) {
822       oidx = 5*(*vi++);
823       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
824       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
825       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
826       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
827       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
828       v  += 25;
829     }
830     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
831     idx += 5;
832   }
833   /* backward solve the L^T */
834   for (i=n-1; i>=0; i--){
835     v    = aa + 25*diag[i] - 25;
836     vi   = aj + diag[i] - 1;
837     nz   = diag[i] - ai[i];
838     idt  = 5*i;
839     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
840     while (nz--) {
841       idx   = 5*(*vi--);
842       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847       v -= 25;
848     }
849   }
850 
851   /* copy t into x according to permutation */
852   ii = 0;
853   for (i=0; i<n; i++) {
854     ir      = 5*r[i];
855     x[ir]   = t[ii];
856     x[ir+1] = t[ii+1];
857     x[ir+2] = t[ii+2];
858     x[ir+3] = t[ii+3];
859     x[ir+4] = t[ii+4];
860     ii += 5;
861   }
862 
863   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
864   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
865   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
866   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
867   PLogFlops(2*25*(a->nz) - 5*a->n);
868   PetscFunctionReturn(0);
869 }
870 
871 #undef __FUNC__
872 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_6"
873 int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
874 {
875   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
876   IS              iscol=a->col,isrow=a->row;
877   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
878   int             *diag = a->diag,ii,ic,ir,oidx;
879   MatScalar       *aa=a->a,*v;
880   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
881   Scalar          *x,*b,*t;
882 
883   PetscFunctionBegin;
884   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
885   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
886   t  = a->solve_work;
887 
888   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
889   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
890 
891   /* copy the b into temp work space according to permutation */
892   ii = 0;
893   for (i=0; i<n; i++) {
894     ic      = 6*c[i];
895     t[ii]   = b[ic];
896     t[ii+1] = b[ic+1];
897     t[ii+2] = b[ic+2];
898     t[ii+3] = b[ic+3];
899     t[ii+4] = b[ic+4];
900     t[ii+5] = b[ic+5];
901     ii += 6;
902   }
903 
904   /* forward solve the U^T */
905   idx = 0;
906   for (i=0; i<n; i++) {
907 
908     v     = aa + 36*diag[i];
909     /* multiply by the inverse of the block diagonal */
910     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
911     x6    = t[5+idx];
912     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
913     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
914     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
915     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
916     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
917     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
918     v += 36;
919 
920     vi    = aj + diag[i] + 1;
921     nz    = ai[i+1] - diag[i] - 1;
922     while (nz--) {
923       oidx = 6*(*vi++);
924       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
925       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
926       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
927       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
928       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
929       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
930       v  += 36;
931     }
932     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
933     t[5+idx] = s6;
934     idx += 6;
935   }
936   /* backward solve the L^T */
937   for (i=n-1; i>=0; i--){
938     v    = aa + 36*diag[i] - 36;
939     vi   = aj + diag[i] - 1;
940     nz   = diag[i] - ai[i];
941     idt  = 6*i;
942     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
943     s6 = t[5+idt];
944     while (nz--) {
945       idx   = 6*(*vi--);
946       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
947       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
948       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
949       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
950       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
951       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
952       v -= 36;
953     }
954   }
955 
956   /* copy t into x according to permutation */
957   ii = 0;
958   for (i=0; i<n; i++) {
959     ir      = 6*r[i];
960     x[ir]   = t[ii];
961     x[ir+1] = t[ii+1];
962     x[ir+2] = t[ii+2];
963     x[ir+3] = t[ii+3];
964     x[ir+4] = t[ii+4];
965     x[ir+5] = t[ii+5];
966     ii += 6;
967   }
968 
969   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
970   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
971   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
972   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
973   PLogFlops(2*36*(a->nz) - 6*a->n);
974   PetscFunctionReturn(0);
975 }
976 
977 #undef __FUNC__
978 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_7"
979 int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
980 {
981   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
982   IS              iscol=a->col,isrow=a->row;
983   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
984   int             *diag = a->diag,ii,ic,ir,oidx;
985   MatScalar       *aa=a->a,*v;
986   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
987   Scalar          *x,*b,*t;
988 
989   PetscFunctionBegin;
990   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
991   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
992   t  = a->solve_work;
993 
994   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
995   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
996 
997   /* copy the b into temp work space according to permutation */
998   ii = 0;
999   for (i=0; i<n; i++) {
1000     ic      = 7*c[i];
1001     t[ii]   = b[ic];
1002     t[ii+1] = b[ic+1];
1003     t[ii+2] = b[ic+2];
1004     t[ii+3] = b[ic+3];
1005     t[ii+4] = b[ic+4];
1006     t[ii+5] = b[ic+5];
1007     t[ii+6] = b[ic+6];
1008     ii += 7;
1009   }
1010 
1011   /* forward solve the U^T */
1012   idx = 0;
1013   for (i=0; i<n; i++) {
1014 
1015     v     = aa + 49*diag[i];
1016     /* multiply by the inverse of the block diagonal */
1017     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1018     x6    = t[5+idx]; x7 = t[6+idx];
1019     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1020     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1021     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1022     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1023     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1024     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1025     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1026     v += 49;
1027 
1028     vi    = aj + diag[i] + 1;
1029     nz    = ai[i+1] - diag[i] - 1;
1030     while (nz--) {
1031       oidx = 7*(*vi++);
1032       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1033       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1034       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1035       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1036       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1037       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1038       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1039       v  += 49;
1040     }
1041     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1042     t[5+idx] = s6;t[6+idx] = s7;
1043     idx += 7;
1044   }
1045   /* backward solve the L^T */
1046   for (i=n-1; i>=0; i--){
1047     v    = aa + 49*diag[i] - 49;
1048     vi   = aj + diag[i] - 1;
1049     nz   = diag[i] - ai[i];
1050     idt  = 7*i;
1051     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1052     s6 = t[5+idt];s7 = t[6+idt];
1053     while (nz--) {
1054       idx   = 7*(*vi--);
1055       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1056       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1057       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1058       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1059       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1060       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1061       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1062       v -= 49;
1063     }
1064   }
1065 
1066   /* copy t into x according to permutation */
1067   ii = 0;
1068   for (i=0; i<n; i++) {
1069     ir      = 7*r[i];
1070     x[ir]   = t[ii];
1071     x[ir+1] = t[ii+1];
1072     x[ir+2] = t[ii+2];
1073     x[ir+3] = t[ii+3];
1074     x[ir+4] = t[ii+4];
1075     x[ir+5] = t[ii+5];
1076     x[ir+6] = t[ii+6];
1077     ii += 7;
1078   }
1079 
1080   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1081   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1082   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1083   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1084   PLogFlops(2*49*(a->nz) - 7*a->n);
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 /* ----------------------------------------------------------- */
1089 #undef __FUNC__
1090 #define __FUNC__ "MatSolve_SeqSBAIJ_N"
1091 int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
1092 {
1093   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1094   IS              iscol=a->col,isrow=a->row;
1095   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1096   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1097   MatScalar       *aa=a->a,*v;
1098   Scalar          *x,*b,*s,*t,*ls;
1099 
1100   PetscFunctionBegin;
1101   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1102   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1103   t  = a->solve_work;
1104 
1105   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1106   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1107 
1108   /* forward solve the lower triangular */
1109   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1110   for (i=1; i<n; i++) {
1111     v   = aa + bs2*ai[i];
1112     vi  = aj + ai[i];
1113     nz  = a->diag[i] - ai[i];
1114     s = t + bs*i;
1115     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1116     while (nz--) {
1117       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1118       v += bs2;
1119     }
1120   }
1121   /* backward solve the upper triangular */
1122   ls = a->solve_work + a->n;
1123   for (i=n-1; i>=0; i--){
1124     v   = aa + bs2*(a->diag[i] + 1);
1125     vi  = aj + a->diag[i] + 1;
1126     nz  = ai[i+1] - a->diag[i] - 1;
1127     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1128     while (nz--) {
1129       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1130       v += bs2;
1131     }
1132     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1133     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1134   }
1135 
1136   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1137   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1138   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1139   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1140   PLogFlops(2*(a->bs2)*(a->nz) - a->bs*a->n);
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 #undef __FUNC__
1145 #define __FUNC__ "MatSolve_SeqSBAIJ_7"
1146 int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
1147 {
1148   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1149   IS              iscol=a->col,isrow=a->row;
1150   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1151   int             *diag = a->diag;
1152   MatScalar       *aa=a->a,*v;
1153   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1154   Scalar          *x,*b,*t;
1155 
1156   PetscFunctionBegin;
1157   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1158   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1159   t  = a->solve_work;
1160 
1161   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1162   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1163 
1164   /* forward solve the lower triangular */
1165   idx    = 7*(*r++);
1166   t[0] = b[idx];   t[1] = b[1+idx];
1167   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1168   t[5] = b[5+idx]; t[6] = b[6+idx];
1169 
1170   for (i=1; i<n; i++) {
1171     v     = aa + 49*ai[i];
1172     vi    = aj + ai[i];
1173     nz    = diag[i] - ai[i];
1174     idx   = 7*(*r++);
1175     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1176     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1177     while (nz--) {
1178       idx   = 7*(*vi++);
1179       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1180       x4    = t[3+idx];x5 = t[4+idx];
1181       x6    = t[5+idx];x7 = t[6+idx];
1182       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1183       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1184       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1185       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1186       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1187       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1188       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1189       v += 49;
1190     }
1191     idx = 7*i;
1192     t[idx]   = s1;t[1+idx] = s2;
1193     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1194     t[5+idx] = s6;t[6+idx] = s7;
1195   }
1196   /* backward solve the upper triangular */
1197   for (i=n-1; i>=0; i--){
1198     v    = aa + 49*diag[i] + 49;
1199     vi   = aj + diag[i] + 1;
1200     nz   = ai[i+1] - diag[i] - 1;
1201     idt  = 7*i;
1202     s1 = t[idt];  s2 = t[1+idt];
1203     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1204     s6 = t[5+idt];s7 = t[6+idt];
1205     while (nz--) {
1206       idx   = 7*(*vi++);
1207       x1    = t[idx];   x2 = t[1+idx];
1208       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1209       x6    = t[5+idx]; x7 = t[6+idx];
1210       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1211       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1212       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1213       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1214       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1215       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1216       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1217       v += 49;
1218     }
1219     idc = 7*(*c--);
1220     v   = aa + 49*diag[i];
1221     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1222                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1223     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1224                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1225     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1226                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1227     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1228                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1229     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1230                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1231     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1232                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1233     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1234                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1235   }
1236 
1237   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1238   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1239   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1241   PLogFlops(2*49*(a->nz) - 7*a->n);
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 #undef __FUNC__
1246 #define __FUNC__ "MatSolve_SeqSBAIJ_7_NaturalOrdering"
1247 int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1248 {
1249   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1250   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1251   int             ierr,*diag = a->diag,jdx;
1252   MatScalar       *aa=a->a,*v;
1253   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1254 
1255   PetscFunctionBegin;
1256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1258   /* forward solve the lower triangular */
1259   idx    = 0;
1260   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1261   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1262   x[6] = b[6+idx];
1263   for (i=1; i<n; i++) {
1264     v     =  aa + 49*ai[i];
1265     vi    =  aj + ai[i];
1266     nz    =  diag[i] - ai[i];
1267     idx   =  7*i;
1268     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1269     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1270     s7  =  b[6+idx];
1271     while (nz--) {
1272       jdx   = 7*(*vi++);
1273       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1274       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1275       x7    = x[6+jdx];
1276       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1277       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1278       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1279       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1280       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1281       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1282       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1283       v += 49;
1284      }
1285     x[idx]   = s1;
1286     x[1+idx] = s2;
1287     x[2+idx] = s3;
1288     x[3+idx] = s4;
1289     x[4+idx] = s5;
1290     x[5+idx] = s6;
1291     x[6+idx] = s7;
1292   }
1293   /* backward solve the upper triangular */
1294   for (i=n-1; i>=0; i--){
1295     v    = aa + 49*diag[i] + 49;
1296     vi   = aj + diag[i] + 1;
1297     nz   = ai[i+1] - diag[i] - 1;
1298     idt  = 7*i;
1299     s1 = x[idt];   s2 = x[1+idt];
1300     s3 = x[2+idt]; s4 = x[3+idt];
1301     s5 = x[4+idt]; s6 = x[5+idt];
1302     s7 = x[6+idt];
1303     while (nz--) {
1304       idx   = 7*(*vi++);
1305       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1306       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1307       x7    = x[6+idx];
1308       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1309       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1310       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1311       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1312       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1313       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1314       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1315       v += 49;
1316     }
1317     v        = aa + 49*diag[i];
1318     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1319                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1320     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1321                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1322     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1323                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1324     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1325                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1326     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1327                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1328     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1329                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1330     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1331                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1332   }
1333 
1334   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1336   PLogFlops(2*36*(a->nz) - 6*a->n);
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 #undef __FUNC__
1341 #define __FUNC__ "MatSolve_SeqSBAIJ_6"
1342 int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
1343 {
1344   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1345   IS              iscol=a->col,isrow=a->row;
1346   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1347   int             *diag = a->diag;
1348   MatScalar       *aa=a->a,*v;
1349   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1350 
1351   PetscFunctionBegin;
1352   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354   t  = a->solve_work;
1355 
1356   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1358 
1359   /* forward solve the lower triangular */
1360   idx    = 6*(*r++);
1361   t[0] = b[idx];   t[1] = b[1+idx];
1362   t[2] = b[2+idx]; t[3] = b[3+idx];
1363   t[4] = b[4+idx]; t[5] = b[5+idx];
1364   for (i=1; i<n; i++) {
1365     v     = aa + 36*ai[i];
1366     vi    = aj + ai[i];
1367     nz    = diag[i] - ai[i];
1368     idx   = 6*(*r++);
1369     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1370     s5  = b[4+idx]; s6 = b[5+idx];
1371     while (nz--) {
1372       idx   = 6*(*vi++);
1373       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1374       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1375       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1376       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1377       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1378       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1379       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1380       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1381       v += 36;
1382     }
1383     idx = 6*i;
1384     t[idx]   = s1;t[1+idx] = s2;
1385     t[2+idx] = s3;t[3+idx] = s4;
1386     t[4+idx] = s5;t[5+idx] = s6;
1387   }
1388   /* backward solve the upper triangular */
1389   for (i=n-1; i>=0; i--){
1390     v    = aa + 36*diag[i] + 36;
1391     vi   = aj + diag[i] + 1;
1392     nz   = ai[i+1] - diag[i] - 1;
1393     idt  = 6*i;
1394     s1 = t[idt];  s2 = t[1+idt];
1395     s3 = t[2+idt];s4 = t[3+idt];
1396     s5 = t[4+idt];s6 = t[5+idt];
1397     while (nz--) {
1398       idx   = 6*(*vi++);
1399       x1    = t[idx];   x2 = t[1+idx];
1400       x3    = t[2+idx]; x4 = t[3+idx];
1401       x5    = t[4+idx]; x6 = t[5+idx];
1402       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1403       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1404       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1405       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1406       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1407       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1408       v += 36;
1409     }
1410     idc = 6*(*c--);
1411     v   = aa + 36*diag[i];
1412     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1413                                  v[18]*s4+v[24]*s5+v[30]*s6;
1414     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1415                                  v[19]*s4+v[25]*s5+v[31]*s6;
1416     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1417                                  v[20]*s4+v[26]*s5+v[32]*s6;
1418     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1419                                  v[21]*s4+v[27]*s5+v[33]*s6;
1420     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1421                                  v[22]*s4+v[28]*s5+v[34]*s6;
1422     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1423                                  v[23]*s4+v[29]*s5+v[35]*s6;
1424   }
1425 
1426   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1427   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1428   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1429   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1430   PLogFlops(2*36*(a->nz) - 6*a->n);
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 #undef __FUNC__
1435 #define __FUNC__ "MatSolve_SeqSBAIJ_6_NaturalOrdering"
1436 int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1437 {
1438   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1439   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1440   int             ierr,*diag = a->diag,jdx;
1441   MatScalar       *aa=a->a,*v;
1442   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1443 
1444   PetscFunctionBegin;
1445   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1446   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1447   /* forward solve the lower triangular */
1448   idx    = 0;
1449   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1450   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1451   for (i=1; i<n; i++) {
1452     v     =  aa + 36*ai[i];
1453     vi    =  aj + ai[i];
1454     nz    =  diag[i] - ai[i];
1455     idx   =  6*i;
1456     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1457     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1458     while (nz--) {
1459       jdx   = 6*(*vi++);
1460       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1461       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1462       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1463       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1464       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1465       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1466       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1467       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1468       v += 36;
1469      }
1470     x[idx]   = s1;
1471     x[1+idx] = s2;
1472     x[2+idx] = s3;
1473     x[3+idx] = s4;
1474     x[4+idx] = s5;
1475     x[5+idx] = s6;
1476   }
1477   /* backward solve the upper triangular */
1478   for (i=n-1; i>=0; i--){
1479     v    = aa + 36*diag[i] + 36;
1480     vi   = aj + diag[i] + 1;
1481     nz   = ai[i+1] - diag[i] - 1;
1482     idt  = 6*i;
1483     s1 = x[idt];   s2 = x[1+idt];
1484     s3 = x[2+idt]; s4 = x[3+idt];
1485     s5 = x[4+idt]; s6 = x[5+idt];
1486     while (nz--) {
1487       idx   = 6*(*vi++);
1488       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1489       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1490       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496       v += 36;
1497     }
1498     v        = aa + 36*diag[i];
1499     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1500     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1501     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1502     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1503     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1504     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1505   }
1506 
1507   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1508   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1509   PLogFlops(2*36*(a->nz) - 6*a->n);
1510   PetscFunctionReturn(0);
1511 }
1512 
1513 #undef __FUNC__
1514 #define __FUNC__ "MatSolve_SeqSBAIJ_5"
1515 int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
1516 {
1517   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1518   IS              iscol=a->col,isrow=a->row;
1519   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1520   int             *diag = a->diag;
1521   MatScalar       *aa=a->a,*v;
1522   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1523 
1524   PetscFunctionBegin;
1525   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1526   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1527   t  = a->solve_work;
1528 
1529   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1530   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1531 
1532   /* forward solve the lower triangular */
1533   idx    = 5*(*r++);
1534   t[0] = b[idx];   t[1] = b[1+idx];
1535   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1536   for (i=1; i<n; i++) {
1537     v     = aa + 25*ai[i];
1538     vi    = aj + ai[i];
1539     nz    = diag[i] - ai[i];
1540     idx   = 5*(*r++);
1541     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1542     s5  = b[4+idx];
1543     while (nz--) {
1544       idx   = 5*(*vi++);
1545       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1546       x4    = t[3+idx];x5 = t[4+idx];
1547       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1548       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1549       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1550       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1551       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1552       v += 25;
1553     }
1554     idx = 5*i;
1555     t[idx]   = s1;t[1+idx] = s2;
1556     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1557   }
1558   /* backward solve the upper triangular */
1559   for (i=n-1; i>=0; i--){
1560     v    = aa + 25*diag[i] + 25;
1561     vi   = aj + diag[i] + 1;
1562     nz   = ai[i+1] - diag[i] - 1;
1563     idt  = 5*i;
1564     s1 = t[idt];  s2 = t[1+idt];
1565     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1566     while (nz--) {
1567       idx   = 5*(*vi++);
1568       x1    = t[idx];   x2 = t[1+idx];
1569       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1570       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1571       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1572       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1573       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1574       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1575       v += 25;
1576     }
1577     idc = 5*(*c--);
1578     v   = aa + 25*diag[i];
1579     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1580                                  v[15]*s4+v[20]*s5;
1581     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1582                                  v[16]*s4+v[21]*s5;
1583     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1584                                  v[17]*s4+v[22]*s5;
1585     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1586                                  v[18]*s4+v[23]*s5;
1587     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1588                                  v[19]*s4+v[24]*s5;
1589   }
1590 
1591   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1592   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1595   PLogFlops(2*25*(a->nz) - 5*a->n);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 #undef __FUNC__
1600 #define __FUNC__ "MatSolve_SeqSBAIJ_5_NaturalOrdering"
1601 int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1602 {
1603   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1604   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1605   int             ierr,*diag = a->diag,jdx;
1606   MatScalar       *aa=a->a,*v;
1607   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1608 
1609   PetscFunctionBegin;
1610   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1611   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1612   /* forward solve the lower triangular */
1613   idx    = 0;
1614   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1615   for (i=1; i<n; i++) {
1616     v     =  aa + 25*ai[i];
1617     vi    =  aj + ai[i];
1618     nz    =  diag[i] - ai[i];
1619     idx   =  5*i;
1620     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1621     while (nz--) {
1622       jdx   = 5*(*vi++);
1623       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1624       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1625       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1626       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1627       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1628       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1629       v    += 25;
1630     }
1631     x[idx]   = s1;
1632     x[1+idx] = s2;
1633     x[2+idx] = s3;
1634     x[3+idx] = s4;
1635     x[4+idx] = s5;
1636   }
1637   /* backward solve the upper triangular */
1638   for (i=n-1; i>=0; i--){
1639     v    = aa + 25*diag[i] + 25;
1640     vi   = aj + diag[i] + 1;
1641     nz   = ai[i+1] - diag[i] - 1;
1642     idt  = 5*i;
1643     s1 = x[idt];  s2 = x[1+idt];
1644     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1645     while (nz--) {
1646       idx   = 5*(*vi++);
1647       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1648       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1649       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1650       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1651       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1652       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1653       v    += 25;
1654     }
1655     v        = aa + 25*diag[i];
1656     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1657     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1658     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1659     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1660     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1661   }
1662 
1663   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1664   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1665   PLogFlops(2*25*(a->nz) - 5*a->n);
1666   PetscFunctionReturn(0);
1667 }
1668 
1669 #undef __FUNC__
1670 #define __FUNC__ "MatSolve_SeqSBAIJ_4"
1671 int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
1672 {
1673   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1674   IS              iscol=a->col,isrow=a->row;
1675   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1676   int             *diag = a->diag;
1677   MatScalar       *aa=a->a,*v;
1678   Scalar          *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1679 
1680   PetscFunctionBegin;
1681   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1682   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1683   t  = a->solve_work;
1684 
1685   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1686   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1687 
1688   /* forward solve the lower triangular */
1689   idx    = 4*(*r++);
1690   t[0] = b[idx];   t[1] = b[1+idx];
1691   t[2] = b[2+idx]; t[3] = b[3+idx];
1692   for (i=1; i<n; i++) {
1693     v     = aa + 16*ai[i];
1694     vi    = aj + ai[i];
1695     nz    = diag[i] - ai[i];
1696     idx   = 4*(*r++);
1697     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1698     while (nz--) {
1699       idx   = 4*(*vi++);
1700       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1701       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1702       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1703       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1704       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1705       v    += 16;
1706     }
1707     idx        = 4*i;
1708     t[idx]   = s1;t[1+idx] = s2;
1709     t[2+idx] = s3;t[3+idx] = s4;
1710   }
1711   /* backward solve the upper triangular */
1712   for (i=n-1; i>=0; i--){
1713     v    = aa + 16*diag[i] + 16;
1714     vi   = aj + diag[i] + 1;
1715     nz   = ai[i+1] - diag[i] - 1;
1716     idt  = 4*i;
1717     s1 = t[idt];  s2 = t[1+idt];
1718     s3 = t[2+idt];s4 = t[3+idt];
1719     while (nz--) {
1720       idx   = 4*(*vi++);
1721       x1    = t[idx];   x2 = t[1+idx];
1722       x3    = t[2+idx]; x4 = t[3+idx];
1723       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1724       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1725       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1726       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1727       v += 16;
1728     }
1729     idc      = 4*(*c--);
1730     v        = aa + 16*diag[i];
1731     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1732     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1733     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1734     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1735   }
1736 
1737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741   PLogFlops(2*16*(a->nz) - 4*a->n);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 
1746 /*
1747       Special case where the matrix was ILU(0) factored in the natural
1748    ordering. This eliminates the need for the column and row permutation.
1749 */
1750 #undef __FUNC__
1751 #define __FUNC__ "MatSolve_SeqSBAIJ_4_NaturalOrdering"
1752 int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1753 {
1754   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1755   int             n=a->mbs,*ai=a->i,*aj=a->j;
1756   int             ierr,*diag = a->diag;
1757   MatScalar       *aa=a->a;
1758   Scalar          *x,*b;
1759 
1760   PetscFunctionBegin;
1761   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763 
1764 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1765   {
1766     static Scalar w[2000]; /* very BAD need to fix */
1767     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1768   }
1769 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1770   {
1771     static Scalar w[2000]; /* very BAD need to fix */
1772     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1773   }
1774 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1775   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1776 #else
1777   {
1778     Scalar    s1,s2,s3,s4,x1,x2,x3,x4;
1779     MatScalar *v;
1780     int       jdx,idt,idx,nz,*vi,i,ai16;
1781 
1782   /* forward solve the lower triangular */
1783   idx    = 0;
1784   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1785   for (i=1; i<n; i++) {
1786     v     =  aa      + 16*ai[i];
1787     vi    =  aj      + ai[i];
1788     nz    =  diag[i] - ai[i];
1789     idx   +=  4;
1790     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1791     while (nz--) {
1792       jdx   = 4*(*vi++);
1793       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1794       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1795       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1796       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1797       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1798       v    += 16;
1799     }
1800     x[idx]   = s1;
1801     x[1+idx] = s2;
1802     x[2+idx] = s3;
1803     x[3+idx] = s4;
1804   }
1805   /* backward solve the upper triangular */
1806   idt = 4*(n-1);
1807   for (i=n-1; i>=0; i--){
1808     ai16 = 16*diag[i];
1809     v    = aa + ai16 + 16;
1810     vi   = aj + diag[i] + 1;
1811     nz   = ai[i+1] - diag[i] - 1;
1812     s1 = x[idt];  s2 = x[1+idt];
1813     s3 = x[2+idt];s4 = x[3+idt];
1814     while (nz--) {
1815       idx   = 4*(*vi++);
1816       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1817       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1818       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1819       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1820       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1821       v    += 16;
1822     }
1823     v        = aa + ai16;
1824     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
1825     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
1826     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1827     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1828     idt -= 4;
1829   }
1830   }
1831 #endif
1832 
1833   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1834   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1835   PLogFlops(2*16*(a->nz) - 4*a->n);
1836   PetscFunctionReturn(0);
1837 }
1838 
1839 #undef __FUNC__
1840 #define __FUNC__ "MatSolve_SeqSBAIJ_3"
1841 int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
1842 {
1843   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1844   IS              iscol=a->col,isrow=a->row;
1845   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1846   int             *diag = a->diag;
1847   MatScalar       *aa=a->a,*v;
1848   Scalar          *x,*b,s1,s2,s3,x1,x2,x3,*t;
1849 
1850   PetscFunctionBegin;
1851   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1852   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1853   t  = a->solve_work;
1854 
1855   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1856   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1857 
1858   /* forward solve the lower triangular */
1859   idx    = 3*(*r++);
1860   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1861   for (i=1; i<n; i++) {
1862     v     = aa + 9*ai[i];
1863     vi    = aj + ai[i];
1864     nz    = diag[i] - ai[i];
1865     idx   = 3*(*r++);
1866     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1867     while (nz--) {
1868       idx   = 3*(*vi++);
1869       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1870       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1871       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1872       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1873       v += 9;
1874     }
1875     idx = 3*i;
1876     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1877   }
1878   /* backward solve the upper triangular */
1879   for (i=n-1; i>=0; i--){
1880     v    = aa + 9*diag[i] + 9;
1881     vi   = aj + diag[i] + 1;
1882     nz   = ai[i+1] - diag[i] - 1;
1883     idt  = 3*i;
1884     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1885     while (nz--) {
1886       idx   = 3*(*vi++);
1887       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1888       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1889       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1890       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1891       v += 9;
1892     }
1893     idc = 3*(*c--);
1894     v   = aa + 9*diag[i];
1895     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1896     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1897     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1898   }
1899   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1900   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1901   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1902   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1903   PLogFlops(2*9*(a->nz) - 3*a->n);
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 /*
1908       Special case where the matrix was ILU(0) factored in the natural
1909    ordering. This eliminates the need for the column and row permutation.
1910 */
1911 #undef __FUNC__
1912 #define __FUNC__ "MatSolve_SeqSBAIJ_3_NaturalOrdering"
1913 int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1914 {
1915   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1916   int             n=a->mbs,*ai=a->i,*aj=a->j;
1917   int             ierr,*diag = a->diag;
1918   MatScalar       *aa=a->a,*v;
1919   Scalar          *x,*b,s1,s2,s3,x1,x2,x3;
1920   int             jdx,idt,idx,nz,*vi,i;
1921 
1922   PetscFunctionBegin;
1923   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1924   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1925 
1926 
1927   /* forward solve the lower triangular */
1928   idx    = 0;
1929   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
1930   for (i=1; i<n; i++) {
1931     v     =  aa      + 9*ai[i];
1932     vi    =  aj      + ai[i];
1933     nz    =  diag[i] - ai[i];
1934     idx   +=  3;
1935     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
1936     while (nz--) {
1937       jdx   = 3*(*vi++);
1938       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1939       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1940       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1941       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1942       v    += 9;
1943     }
1944     x[idx]   = s1;
1945     x[1+idx] = s2;
1946     x[2+idx] = s3;
1947   }
1948   /* backward solve the upper triangular */
1949   for (i=n-1; i>=0; i--){
1950     v    = aa + 9*diag[i] + 9;
1951     vi   = aj + diag[i] + 1;
1952     nz   = ai[i+1] - diag[i] - 1;
1953     idt  = 3*i;
1954     s1 = x[idt];  s2 = x[1+idt];
1955     s3 = x[2+idt];
1956     while (nz--) {
1957       idx   = 3*(*vi++);
1958       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
1959       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1960       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1961       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1962       v    += 9;
1963     }
1964     v        = aa +  9*diag[i];
1965     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1966     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1967     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1968   }
1969 
1970   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1971   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1972   PLogFlops(2*9*(a->nz) - 3*a->n);
1973   PetscFunctionReturn(0);
1974 }
1975 
1976 #undef __FUNC__
1977 #define __FUNC__ "MatSolve_SeqSBAIJ_2"
1978 int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1979 {
1980   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1981   IS              iscol=a->col,isrow=a->row;
1982   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1983   int             *diag = a->diag;
1984   MatScalar       *aa=a->a,*v;
1985   Scalar          *x,*b,s1,s2,x1,x2,*t;
1986 
1987   PetscFunctionBegin;
1988   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1989   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1990   t  = a->solve_work;
1991 
1992   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1993   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1994 
1995   /* forward solve the lower triangular */
1996   idx    = 2*(*r++);
1997   t[0] = b[idx]; t[1] = b[1+idx];
1998   for (i=1; i<n; i++) {
1999     v     = aa + 4*ai[i];
2000     vi    = aj + ai[i];
2001     nz    = diag[i] - ai[i];
2002     idx   = 2*(*r++);
2003     s1  = b[idx]; s2 = b[1+idx];
2004     while (nz--) {
2005       idx   = 2*(*vi++);
2006       x1    = t[idx]; x2 = t[1+idx];
2007       s1 -= v[0]*x1 + v[2]*x2;
2008       s2 -= v[1]*x1 + v[3]*x2;
2009       v += 4;
2010     }
2011     idx = 2*i;
2012     t[idx] = s1; t[1+idx] = s2;
2013   }
2014   /* backward solve the upper triangular */
2015   for (i=n-1; i>=0; i--){
2016     v    = aa + 4*diag[i] + 4;
2017     vi   = aj + diag[i] + 1;
2018     nz   = ai[i+1] - diag[i] - 1;
2019     idt  = 2*i;
2020     s1 = t[idt]; s2 = t[1+idt];
2021     while (nz--) {
2022       idx   = 2*(*vi++);
2023       x1    = t[idx]; x2 = t[1+idx];
2024       s1 -= v[0]*x1 + v[2]*x2;
2025       s2 -= v[1]*x1 + v[3]*x2;
2026       v += 4;
2027     }
2028     idc = 2*(*c--);
2029     v   = aa + 4*diag[i];
2030     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2031     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2032   }
2033   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2034   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2035   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2037   PLogFlops(2*4*(a->nz) - 2*a->n);
2038   PetscFunctionReturn(0);
2039 }
2040 
2041 /*
2042       Special case where the matrix was ILU(0) factored in the natural
2043    ordering. This eliminates the need for the column and row permutation.
2044 */
2045 #undef __FUNC__
2046 #define __FUNC__ "MatSolve_SeqSBAIJ_2_NaturalOrdering"
2047 int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2048 {
2049   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2050   int             n=a->mbs,*ai=a->i,*aj=a->j;
2051   int             ierr,*diag = a->diag;
2052   MatScalar       *aa=a->a,*v;
2053   Scalar          *x,*b,s1,s2,x1,x2;
2054   int             jdx,idt,idx,nz,*vi,i;
2055 
2056   PetscFunctionBegin;
2057   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2058   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2059 
2060   /* forward solve the lower triangular */
2061   idx    = 0;
2062   x[0]   = b[0]; x[1] = b[1];
2063   for (i=1; i<n; i++) {
2064     v     =  aa      + 4*ai[i];
2065     vi    =  aj      + ai[i];
2066     nz    =  diag[i] - ai[i];
2067     idx   +=  2;
2068     s1  =  b[idx];s2 = b[1+idx];
2069     while (nz--) {
2070       jdx   = 2*(*vi++);
2071       x1    = x[jdx];x2 = x[1+jdx];
2072       s1 -= v[0]*x1 + v[2]*x2;
2073       s2 -= v[1]*x1 + v[3]*x2;
2074       v    += 4;
2075     }
2076     x[idx]   = s1;
2077     x[1+idx] = s2;
2078   }
2079   /* backward solve the upper triangular */
2080   for (i=n-1; i>=0; i--){
2081     v    = aa + 4*diag[i] + 4;
2082     vi   = aj + diag[i] + 1;
2083     nz   = ai[i+1] - diag[i] - 1;
2084     idt  = 2*i;
2085     s1 = x[idt];  s2 = x[1+idt];
2086     while (nz--) {
2087       idx   = 2*(*vi++);
2088       x1    = x[idx];   x2 = x[1+idx];
2089       s1 -= v[0]*x1 + v[2]*x2;
2090       s2 -= v[1]*x1 + v[3]*x2;
2091       v    += 4;
2092     }
2093     v        = aa +  4*diag[i];
2094     x[idt]   = v[0]*s1 + v[2]*s2;
2095     x[1+idt] = v[1]*s1 + v[3]*s2;
2096   }
2097 
2098   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2099   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2100   PLogFlops(2*4*(a->nz) - 2*a->n);
2101   PetscFunctionReturn(0);
2102 }
2103 
2104 /*----------- New 1 --------------*/
2105 #undef __FUNC__
2106 #define __FUNC__ "MatSolve_SeqSBAIJ_1"
2107 int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
2108 {
2109   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ *)A->data;
2110   IS              isrow=a->row;
2111   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip;
2112   MatScalar       *aa=a->a,*v;
2113   Scalar          *x,*b,xk,*xtmp;
2114   int             nz,*vj,k;
2115 
2116   PetscFunctionBegin;
2117   if (!mbs) PetscFunctionReturn(0);
2118 
2119   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2120   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2121   xtmp = a->solve_work;
2122 
2123   ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr);
2124 
2125   /* solve U'*D*y = b by forward substitution */
2126   for (k=0; k<mbs; k++) xtmp[k] = b[rip[k]];
2127   for (k=0; k<mbs; k++){
2128     v  = aa + ai[k];
2129     vj = aj + ai[k];
2130     xk = xtmp[k];
2131     nz = ai[k+1] - ai[k];
2132     while (nz--) xtmp[*vj++] += (*v++) * xk;
2133     xtmp[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
2134   }
2135 
2136   /* solve U*x = y by back substitution */
2137 
2138   for (k=mbs-1; k>=0; k--){
2139     v  = aa + ai[k];
2140     vj = aj + ai[k];
2141     xk = xtmp[k];
2142     nz = ai[k+1] - ai[k];
2143     while (nz--) xk += (*v++) * xtmp[*vj++];
2144     xtmp[k] = xk;
2145     x[rip[k]] = xk;
2146   }
2147 
2148   ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr);
2149   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2150   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2151   PLogFlops(4*a->s_nz + a->m);
2152   PetscFunctionReturn(0);
2153 }
2154 
2155 
2156 #ifdef CONT
2157   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2158   IS              iscol=a->col,isrow=a->row;
2159   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2160   int             *diag = a->diag;
2161   MatScalar       *aa=a->a,*v;
2162   Scalar          *x,*b,s1,*t;
2163 
2164   PetscFunctionBegin;
2165   if (!n) PetscFunctionReturn(0);
2166 
2167   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2168   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2169   t  = a->solve_work;
2170 
2171   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2172   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2173 
2174   /* forward solve the lower triangular */
2175   t[0] = b[*r++];
2176   for (i=1; i<n; i++) {
2177     v     = aa + ai[i];
2178     vi    = aj + ai[i];
2179     nz    = diag[i] - ai[i];
2180     s1  = b[*r++];
2181     while (nz--) {
2182       s1 -= (*v++)*t[*vi++];
2183     }
2184     t[i] = s1;
2185   }
2186   /* backward solve the upper triangular */
2187   for (i=n-1; i>=0; i--){
2188     v    = aa + diag[i] + 1;
2189     vi   = aj + diag[i] + 1;
2190     nz   = ai[i+1] - diag[i] - 1;
2191     s1 = t[i];
2192     while (nz--) {
2193       s1 -= (*v++)*t[*vi++];
2194     }
2195     x[*c--] = t[i] = aa[diag[i]]*s1;
2196   }
2197 
2198   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2199   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2200   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2201   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2202   PLogFlops(2*1*(a->nz) - a->n);
2203   PetscFunctionReturn(0);
2204 }
2205 
2206 #endif
2207 /*----------- New 1 --------------*/
2208 /*
2209       Special case where the matrix was ILU(0) factored in the natural
2210    ordering. This eliminates the need for the column and row permutation.
2211 */
2212 #undef __FUNC__
2213 #define __FUNC__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
2214 int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2215 {
2216   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ *)A->data;
2217   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr;
2218   MatScalar       *aa=a->a,*v;
2219   Scalar          *x,*b,xk;
2220   int             nz,*vj,k;
2221 
2222   PetscFunctionBegin;
2223   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2224   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2225 
2226   /* solve U'*D*y = b by forward substitution */
2227   for (k=0; k<mbs; k++) x[k] = b[k];
2228   for (k=0; k<mbs; k++){
2229     v  = aa + ai[k];
2230     vj = aj + ai[k];
2231     xk = x[k];
2232     nz = ai[k+1] - ai[k];
2233     while (nz--) x[*vj++] += (*v++) * xk;
2234     x[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
2235   }
2236 
2237   /* solve U*x = y by back substitution */
2238   for (k=mbs-2; k>=0; k--){
2239     v  = aa + ai[k];
2240     vj = aj + ai[k];
2241     xk = x[k];
2242     nz = ai[k+1] - ai[k];
2243     while (nz--) xk += (*v++) * x[*vj++];
2244     x[k] = xk;
2245   }
2246 
2247   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2248   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2249   PLogFlops(4*a->s_nz + a->m);
2250   PetscFunctionReturn(0);
2251 }
2252 
2253 /* ----------------------------------------------------------------*/
2254 #undef __FUNC__
2255 #define __FUNC__ "MatILUFactorSymbolic_SeqSBAIJ"
2256 int MatILUFactorSymbolic_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *B)
2257 {
2258   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2259   IS          isicol;
2260   int         *rip,*riip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
2261   int         *jutmp,bs = a->bs,bs2=a->bs2;
2262   int         m,nzi,realloc = 0,*levtmp;
2263   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2264   int         levels,incrlev,*lev,lev_ik,shift;
2265   PetscReal  f;
2266 
2267   PetscFunctionBegin;
2268   if (info) {
2269     f             = info->fill;
2270     levels        = (int)info->levels;
2271   } else {
2272     f             = 1.0;
2273     levels        = 0;
2274   }
2275   PetscValidHeaderSpecific(isrow,IS_COOKIE);
2276   PetscValidHeaderSpecific(iscol,IS_COOKIE);
2277   /* if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,0,"matrix must be square");*/
2278   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2279   ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr);
2280   ierr = ISGetIndices(isicol,&riip);CHKERRQ(ierr);
2281   for (k=0; k<mbs; k++) {
2282     if ( rip[k] - riip[k] != 0 ) {
2283       printf("Non-symm. permutation, use symm. permutation or general matrix format\n");
2284       break;
2285     }
2286   }
2287 
2288   /* initialization */
2289   /* Don't know how many column pointers are needed so estimate.
2290      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
2291 
2292   umax = (int)(f*ai[mbs] + 1);
2293   lev = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(lev);
2294   umax += mbs + 1;
2295   shift = mbs + 1;
2296   ju = iu = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju);
2297   iu[0] = mbs+1;
2298   juptr = mbs;
2299   jl =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2300   q  =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q);
2301   levtmp =  (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(levtmp);
2302   for (i=0; i<mbs; i++){
2303     jl[i] = mbs; q[i] = 0;
2304   }
2305 
2306   /* for each row k */
2307   for (k=0; k<mbs; k++){
2308     nzk = 0;
2309     q[k] = mbs;
2310     /* initialize nonzero structure of k-th row to row rip[k] of A */
2311     jmin = ai[rip[k]];
2312     jmax = ai[rip[k]+1];
2313     for (j=jmin; j<jmax; j++){
2314       vj = riip[aj[j]];
2315       if(vj > k){
2316         qm = k;
2317         do {
2318           m  = qm; qm = q[m];
2319         } while(qm < vj);
2320         if (qm == vj) {
2321           printf(" error: duplicate entry in A\n"); break;
2322         }
2323         nzk++;
2324         q[m]   = vj;
2325         q[vj]  = qm;
2326         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2327       } /* if(vj > k) */
2328     } /* for (j=jmin; j<jmax; j++) */
2329 
2330     /* modify nonzero structure of k-th row by computing fill-in
2331        for each row i to be merged in */
2332     i = k;
2333     i = jl[i]; /* next pivot row (== 0 for symbolic factorization) */
2334     /* printf(" next pivot row i=%d\n",i); */
2335     while (i < mbs){
2336       /* merge row i into k-th row */
2337       j=iu[i];
2338       vj = ju[j];
2339       lev_ik = lev[j-shift];
2340       nzi = iu[i+1] - (iu[i]+1);
2341       jmin = iu[i] + 1; jmax = iu[i] + nzi;
2342       qm = k;
2343       for (j=jmin; j<jmax+1; j++){
2344         vj = ju[j];
2345         incrlev = lev[j-shift]+lev_ik+1;
2346 
2347         if (incrlev > levels) continue;
2348         do {
2349           m = qm; qm = q[m];
2350         } while (qm < vj);
2351         if (qm != vj){      /* a fill */
2352           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2353           levtmp[vj] = incrlev;
2354         }
2355         else {              /* already a nonzero element */
2356           if (levtmp[vj]>incrlev) levtmp[vj] = incrlev;
2357         }
2358       }
2359       i = jl[i]; /* next pivot row */
2360     }
2361 
2362     /* add k to row list for first nonzero element in k-th row */
2363     if (nzk > 1){
2364       i = q[k]; /* col value of first nonzero element in k_th row of U */
2365       jl[k] = jl[i]; jl[i] = k;
2366     }
2367     iu[k+1] = iu[k] + nzk;   /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/
2368 
2369     /* allocate more space to ju and lev if needed */
2370     if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax);
2371       /* estimate how much additional space we will need */
2372       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2373       /* just double the memory each time */
2374       maxadd = umax;
2375       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2376       umax += maxadd;
2377 
2378       /* allocate a longer ju (NOTE: iu poits to the beginning of ju) */
2379       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
2380       ierr  = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
2381       ierr = PetscFree(ju);CHKERRQ(ierr);
2382       ju = iu = jutmp;
2383 
2384       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
2385       ierr  = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));CHKERRQ(ierr);
2386       ierr = PetscFree(lev);CHKERRQ(ierr);
2387       lev = jutmp;
2388 
2389       realloc += 2; /* count how many times we realloc */
2390     }
2391 
2392     /* save nonzero structure of k-th row in ju */
2393     i=k;
2394     jumin = juptr + 1; juptr += nzk;
2395     for (j=jumin; j<juptr+1; j++){
2396       i      = q[i];
2397       ju[j]  = i;
2398       lev[j-shift] = levtmp[i];
2399       /* printf(" k=%d, ju[%d]=%d\n",k,j,ju[j]);*/
2400     }
2401     /* printf("\n");  */
2402   } /* for (k=0; k<mbs; k++) */
2403 
2404   if (ai[mbs] != 0) {
2405     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2406     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
2407     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af);
2408     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af);
2409     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:for best performance.\n");
2410   } else {
2411      PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
2412   }
2413 
2414   ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr);
2415   ierr = ISRestoreIndices(isicol,&riip);CHKERRQ(ierr);
2416 
2417   ierr = PetscFree(q);CHKERRQ(ierr);
2418   ierr = PetscFree(jl);CHKERRQ(ierr);
2419 
2420   /* put together the new matrix */
2421   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
2422   PLogObjectParent(*B,isicol);
2423   b = (Mat_SeqSBAIJ*)(*B)->data;
2424   ierr = PetscFree(b->imax);CHKERRQ(ierr);
2425   b->singlemalloc = PETSC_FALSE;
2426   /* the next line frees the default space generated by the Create() */
2427   ierr = PetscFree(b->a);CHKERRQ(ierr);
2428   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
2429   b->a          = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a);
2430   b->j          = ju;
2431   b->i          = iu;
2432   b->diag       = 0;
2433   b->ilen       = 0;
2434   b->imax       = 0;
2435   b->row        = isrow;
2436   b->col        = iscol;
2437   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2438   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2439   b->icol       = isicol;
2440   b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
2441   /* In b structure:  Free imax, ilen, old a, old j.
2442      Allocate idnew, solve_work, new a, new j */
2443   PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
2444   b->s_maxnz = b->s_nz = iu[mbs];
2445 
2446   (*B)->factor                 = FACTOR_LU;
2447   (*B)->info.factor_mallocs    = realloc;
2448   (*B)->info.fill_ratio_given  = f;
2449   if (ai[mbs] != 0) {
2450     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2451   } else {
2452     (*B)->info.fill_ratio_needed = 0.0;
2453   }
2454 
2455   PetscFunctionReturn(0);
2456 }
2457 
2458 
2459 
2460