xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision f1af5d2ffeae1f5fc391a89939f4818e47770ae3)
1 /*$Id: baijfact2.c,v 1.30 1999/10/24 14:02:28 bsmith Exp bsmith $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatSolveTrans_SeqBAIJ_1_NaturalOrdering"
13 int MatSolveTrans_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17   int             *diag = a->diag;
18   MatScalar       *aa=a->a,*v;
19   Scalar          s1,*x,*b;
20 
21   PetscFunctionBegin;
22   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24 
25   /* forward solve the U^T */
26   for ( i=0; i<n; i++ ) {
27 
28     v     = aa + diag[i];
29     /* multiply by the inverse of the block diagonal */
30     s1    = (*v++)*b[i];
31     vi    = aj + diag[i] + 1;
32     nz    = ai[i+1] - diag[i] - 1;
33     while (nz--) {
34       x[*vi++]  -= (*v++)*s1;
35     }
36     x[i]   = s1;
37   }
38   /* backward solve the L^T */
39   for ( i=n-1; i>=0; i-- ){
40     v    = aa + diag[i] - 1;
41     vi   = aj + diag[i] - 1;
42     nz   = diag[i] - ai[i];
43     s1   = x[i];
44     while (nz--) {
45       x[*vi--]   -=  (*v--)*s1;
46     }
47   }
48   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50   PLogFlops(2*(a->nz) - a->n);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNC__
55 #define __FUNC__ "MatSolveTrans_SeqBAIJ_2_NaturalOrdering"
56 int MatSolveTrans_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57 {
58   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60   int             *diag = a->diag,oidx;
61   MatScalar       *aa=a->a,*v;
62   Scalar          s1,s2,x1,x2;
63   Scalar          *x,*b;
64 
65   PetscFunctionBegin;
66   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
67   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
68 
69   /* forward solve the U^T */
70   idx = 0;
71   for ( i=0; i<n; i++ ) {
72 
73     v     = aa + 4*diag[i];
74     /* multiply by the inverse of the block diagonal */
75     x1 = b[idx];   x2 = b[1+idx];
76     s1 = v[0]*x1  +  v[1]*x2;
77     s2 = v[2]*x1  +  v[3]*x2;
78     v += 4;
79 
80     vi    = aj + diag[i] + 1;
81     nz    = ai[i+1] - diag[i] - 1;
82     while (nz--) {
83       oidx = 2*(*vi++);
84       x[oidx]   -= v[0]*s1  +  v[1]*s2;
85       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
86       v  += 4;
87     }
88     x[idx]   = s1;x[1+idx] = s2;
89     idx += 2;
90   }
91   /* backward solve the L^T */
92   for ( i=n-1; i>=0; i-- ){
93     v    = aa + 4*diag[i] - 4;
94     vi   = aj + diag[i] - 1;
95     nz   = diag[i] - ai[i];
96     idt  = 2*i;
97     s1   = x[idt];  s2 = x[1+idt];
98     while (nz--) {
99       idx   = 2*(*vi--);
100       x[idx]   -=  v[0]*s1 +  v[1]*s2;
101       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
102       v -= 4;
103     }
104   }
105   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
106   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
107   PLogFlops(2*4*(a->nz) - 2*a->n);
108   PetscFunctionReturn(0);
109 }
110 
111 #undef __FUNC__
112 #define __FUNC__ "MatSolveTrans_SeqBAIJ_3_NaturalOrdering"
113 int MatSolveTrans_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
114 {
115   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
116   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
117   int             *diag = a->diag,oidx;
118   MatScalar       *aa=a->a,*v;
119   Scalar          s1,s2,s3,x1,x2,x3;
120   Scalar          *x,*b;
121 
122   PetscFunctionBegin;
123   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
124   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
125 
126   /* forward solve the U^T */
127   idx = 0;
128   for ( i=0; i<n; i++ ) {
129 
130     v     = aa + 9*diag[i];
131     /* multiply by the inverse of the block diagonal */
132     x1 = b[idx];   x2 = b[1+idx]; x3    = b[2+idx];
133     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
134     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
135     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
136     v += 9;
137 
138     vi    = aj + diag[i] + 1;
139     nz    = ai[i+1] - diag[i] - 1;
140     while (nz--) {
141       oidx = 3*(*vi++);
142       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
143       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
144       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
145       v  += 9;
146     }
147     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
148     idx += 3;
149   }
150   /* backward solve the L^T */
151   for ( i=n-1; i>=0; i-- ){
152     v    = aa + 9*diag[i] - 9;
153     vi   = aj + diag[i] - 1;
154     nz   = diag[i] - ai[i];
155     idt  = 3*i;
156     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
157     while (nz--) {
158       idx   = 3*(*vi--);
159       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
160       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
161       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
162       v -= 9;
163     }
164   }
165   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
167   PLogFlops(2*9*(a->nz) - 3*a->n);
168   PetscFunctionReturn(0);
169 }
170 
171 #undef __FUNC__
172 #define __FUNC__ "MatSolveTrans_SeqBAIJ_4_NaturalOrdering"
173 int MatSolveTrans_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
174 {
175   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
176   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
177   int             *diag = a->diag,oidx;
178   MatScalar       *aa=a->a,*v;
179   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
180   Scalar          *x,*b;
181 
182   PetscFunctionBegin;
183   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
184   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
185 
186   /* forward solve the U^T */
187   idx = 0;
188   for ( i=0; i<n; i++ ) {
189 
190     v     = aa + 16*diag[i];
191     /* multiply by the inverse of the block diagonal */
192     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx];
193     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
194     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
195     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
196     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
197     v += 16;
198 
199     vi    = aj + diag[i] + 1;
200     nz    = ai[i+1] - diag[i] - 1;
201     while (nz--) {
202       oidx = 4*(*vi++);
203       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
204       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
205       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
206       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
207       v  += 16;
208     }
209     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
210     idx += 4;
211   }
212   /* backward solve the L^T */
213   for ( i=n-1; i>=0; i-- ){
214     v    = aa + 16*diag[i] - 16;
215     vi   = aj + diag[i] - 1;
216     nz   = diag[i] - ai[i];
217     idt  = 4*i;
218     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
219     while (nz--) {
220       idx   = 4*(*vi--);
221       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
222       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
223       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
224       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
225       v -= 16;
226     }
227   }
228   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
229   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
230   PLogFlops(2*16*(a->nz) - 4*a->n);
231   PetscFunctionReturn(0);
232 }
233 
234 #undef __FUNC__
235 #define __FUNC__ "MatSolveTrans_SeqBAIJ_5_NaturalOrdering"
236 int MatSolveTrans_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
237 {
238   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
239   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
240   int             *diag = a->diag,oidx;
241   MatScalar       *aa=a->a,*v;
242   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
243   Scalar          *x,*b;
244 
245   PetscFunctionBegin;
246   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
247   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
248 
249   /* forward solve the U^T */
250   idx = 0;
251   for ( i=0; i<n; i++ ) {
252 
253     v     = aa + 25*diag[i];
254     /* multiply by the inverse of the block diagonal */
255     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
256     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
257     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
258     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
259     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
260     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
261     v += 25;
262 
263     vi    = aj + diag[i] + 1;
264     nz    = ai[i+1] - diag[i] - 1;
265     while (nz--) {
266       oidx = 5*(*vi++);
267       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
268       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
269       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
270       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
271       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
272       v  += 25;
273     }
274     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
275     idx += 5;
276   }
277   /* backward solve the L^T */
278   for ( i=n-1; i>=0; i-- ){
279     v    = aa + 25*diag[i] - 25;
280     vi   = aj + diag[i] - 1;
281     nz   = diag[i] - ai[i];
282     idt  = 5*i;
283     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
284     while (nz--) {
285       idx   = 5*(*vi--);
286       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
287       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
288       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
289       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
290       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
291       v -= 25;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   PLogFlops(2*25*(a->nz) - 5*a->n);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNC__
301 #define __FUNC__ "MatSolveTrans_SeqBAIJ_6_NaturalOrdering"
302 int MatSolveTrans_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
305   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
306   int             *diag = a->diag,oidx;
307   MatScalar       *aa=a->a,*v;
308   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
309   Scalar          *x,*b;
310 
311   PetscFunctionBegin;
312   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
313   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
314 
315   /* forward solve the U^T */
316   idx = 0;
317   for ( i=0; i<n; i++ ) {
318 
319     v     = aa + 36*diag[i];
320     /* multiply by the inverse of the block diagonal */
321     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
322     x6    = b[5+idx];
323     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
324     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
325     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
326     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
327     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
328     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
329     v += 36;
330 
331     vi    = aj + diag[i] + 1;
332     nz    = ai[i+1] - diag[i] - 1;
333     while (nz--) {
334       oidx = 6*(*vi++);
335       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
336       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
337       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
338       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
339       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
340       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
341       v  += 36;
342     }
343     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
344     x[5+idx] = s6;
345     idx += 6;
346   }
347   /* backward solve the L^T */
348   for ( i=n-1; i>=0; i-- ){
349     v    = aa + 36*diag[i] - 36;
350     vi   = aj + diag[i] - 1;
351     nz   = diag[i] - ai[i];
352     idt  = 6*i;
353     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
354     s6 = x[5+idt];
355     while (nz--) {
356       idx   = 6*(*vi--);
357       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
358       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
359       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
360       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
361       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
362       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
363       v -= 36;
364     }
365   }
366   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
367   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
368   PLogFlops(2*36*(a->nz) - 6*a->n);
369   PetscFunctionReturn(0);
370 }
371 
372 #undef __FUNC__
373 #define __FUNC__ "MatSolveTrans_SeqBAIJ_7_NaturalOrdering"
374 int MatSolveTrans_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
375 {
376   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
377   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
378   int             *diag = a->diag,oidx;
379   MatScalar       *aa=a->a,*v;
380   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
381   Scalar          *x,*b;
382 
383   PetscFunctionBegin;
384   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
385   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
386 
387   /* forward solve the U^T */
388   idx = 0;
389   for ( i=0; i<n; i++ ) {
390 
391     v     = aa + 49*diag[i];
392     /* multiply by the inverse of the block diagonal */
393     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
394     x6    = b[5+idx]; x7 = b[6+idx];
395     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
396     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
397     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
398     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
399     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
400     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
401     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
402     v += 49;
403 
404     vi    = aj + diag[i] + 1;
405     nz    = ai[i+1] - diag[i] - 1;
406     while (nz--) {
407       oidx = 7*(*vi++);
408       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
409       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
410       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
411       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
412       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
413       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
414       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
415       v  += 49;
416     }
417     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
418     x[5+idx] = s6;x[6+idx] = s7;
419     idx += 7;
420   }
421   /* backward solve the L^T */
422   for ( i=n-1; i>=0; i-- ){
423     v    = aa + 49*diag[i] - 49;
424     vi   = aj + diag[i] - 1;
425     nz   = diag[i] - ai[i];
426     idt  = 7*i;
427     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
428     s6 = x[5+idt];s7 = x[6+idt];
429     while (nz--) {
430       idx   = 7*(*vi--);
431       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
432       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
433       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
434       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
435       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
436       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
437       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
438       v -= 49;
439     }
440   }
441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
443   PLogFlops(2*49*(a->nz) - 7*a->n);
444   PetscFunctionReturn(0);
445 }
446 
447 /*---------------------------------------------------------------------------------------------*/
448 #undef __FUNC__
449 #define __FUNC__ "MatSolveTrans_SeqBAIJ_1"
450 int MatSolveTrans_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
451 {
452   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
453   IS              iscol=a->col,isrow=a->row;
454   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
455   int             *diag = a->diag;
456   MatScalar       *aa=a->a,*v;
457   Scalar          s1,*x,*b,*t;
458 
459   PetscFunctionBegin;
460   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
461   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
462   t  = a->solve_work;
463 
464   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
465   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
466 
467   /* copy the b into temp work space according to permutation */
468   for ( i=0; i<n; i++ ) {
469     t[i] = b[c[i]];
470   }
471 
472   /* forward solve the U^T */
473   for ( i=0; i<n; i++ ) {
474 
475     v     = aa + diag[i];
476     /* multiply by the inverse of the block diagonal */
477     s1    = (*v++)*t[i];
478     vi    = aj + diag[i] + 1;
479     nz    = ai[i+1] - diag[i] - 1;
480     while (nz--) {
481       t[*vi++]  -= (*v++)*s1;
482     }
483     t[i]   = s1;
484   }
485   /* backward solve the L^T */
486   for ( i=n-1; i>=0; i-- ){
487     v    = aa + diag[i] - 1;
488     vi   = aj + diag[i] - 1;
489     nz   = diag[i] - ai[i];
490     s1   = t[i];
491     while (nz--) {
492       t[*vi--]   -=  (*v--)*s1;
493     }
494   }
495 
496   /* copy t into x according to permutation */
497   for ( i=0; i<n; i++ ) {
498     x[r[i]]   = t[i];
499   }
500 
501   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
502   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
503   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
504   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
505   PLogFlops(2*(a->nz) - a->n);
506   PetscFunctionReturn(0);
507 }
508 
509 #undef __FUNC__
510 #define __FUNC__ "MatSolveTrans_SeqBAIJ_2"
511 int MatSolveTrans_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
512 {
513   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
514   IS              iscol=a->col,isrow=a->row;
515   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
516   int             *diag = a->diag,ii,ic,ir,oidx;
517   MatScalar       *aa=a->a,*v;
518   Scalar          s1,s2,x1,x2;
519   Scalar          *x,*b,*t;
520 
521   PetscFunctionBegin;
522   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
523   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524   t  = a->solve_work;
525 
526   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
527   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
528 
529   /* copy the b into temp work space according to permutation */
530   ii = 0;
531   for ( i=0; i<n; i++ ) {
532     ic      = 2*c[i];
533     t[ii]   = b[ic];
534     t[ii+1] = b[ic+1];
535     ii += 2;
536   }
537 
538   /* forward solve the U^T */
539   idx = 0;
540   for ( i=0; i<n; i++ ) {
541 
542     v     = aa + 4*diag[i];
543     /* multiply by the inverse of the block diagonal */
544     x1    = t[idx];   x2 = t[1+idx];
545     s1 = v[0]*x1  +  v[1]*x2;
546     s2 = v[2]*x1  +  v[3]*x2;
547     v += 4;
548 
549     vi    = aj + diag[i] + 1;
550     nz    = ai[i+1] - diag[i] - 1;
551     while (nz--) {
552       oidx = 2*(*vi++);
553       t[oidx]   -= v[0]*s1  +  v[1]*s2;
554       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
555       v  += 4;
556     }
557     t[idx]   = s1;t[1+idx] = s2;
558     idx += 2;
559   }
560   /* backward solve the L^T */
561   for ( i=n-1; i>=0; i-- ){
562     v    = aa + 4*diag[i] - 4;
563     vi   = aj + diag[i] - 1;
564     nz   = diag[i] - ai[i];
565     idt  = 2*i;
566     s1 = t[idt];  s2 = t[1+idt];
567     while (nz--) {
568       idx   = 2*(*vi--);
569       t[idx]   -=  v[0]*s1 +  v[1]*s2;
570       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
571       v -= 4;
572     }
573   }
574 
575   /* copy t into x according to permutation */
576   ii = 0;
577   for ( i=0; i<n; i++ ) {
578     ir      = 2*r[i];
579     x[ir]   = t[ii];
580     x[ir+1] = t[ii+1];
581     ii += 2;
582   }
583 
584   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
585   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
586   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
587   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
588   PLogFlops(2*4*(a->nz) - 2*a->n);
589   PetscFunctionReturn(0);
590 }
591 
592 #undef __FUNC__
593 #define __FUNC__ "MatSolveTrans_SeqBAIJ_3"
594 int MatSolveTrans_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
595 {
596   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
597   IS              iscol=a->col,isrow=a->row;
598   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
599   int             *diag = a->diag,ii,ic,ir,oidx;
600   MatScalar       *aa=a->a,*v;
601   Scalar          s1,s2,s3,x1,x2,x3;
602   Scalar          *x,*b,*t;
603 
604   PetscFunctionBegin;
605   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
606   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
607   t  = a->solve_work;
608 
609   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
610   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
611 
612   /* copy the b into temp work space according to permutation */
613   ii = 0;
614   for ( i=0; i<n; i++ ) {
615     ic      = 3*c[i];
616     t[ii]   = b[ic];
617     t[ii+1] = b[ic+1];
618     t[ii+2] = b[ic+2];
619     ii += 3;
620   }
621 
622   /* forward solve the U^T */
623   idx = 0;
624   for ( i=0; i<n; i++ ) {
625 
626     v     = aa + 9*diag[i];
627     /* multiply by the inverse of the block diagonal */
628     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
629     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
630     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
631     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
632     v += 9;
633 
634     vi    = aj + diag[i] + 1;
635     nz    = ai[i+1] - diag[i] - 1;
636     while (nz--) {
637       oidx = 3*(*vi++);
638       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
639       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
640       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
641       v  += 9;
642     }
643     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
644     idx += 3;
645   }
646   /* backward solve the L^T */
647   for ( i=n-1; i>=0; i-- ){
648     v    = aa + 9*diag[i] - 9;
649     vi   = aj + diag[i] - 1;
650     nz   = diag[i] - ai[i];
651     idt  = 3*i;
652     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
653     while (nz--) {
654       idx   = 3*(*vi--);
655       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
656       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
657       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658       v -= 9;
659     }
660   }
661 
662   /* copy t into x according to permutation */
663   ii = 0;
664   for ( i=0; i<n; i++ ) {
665     ir      = 3*r[i];
666     x[ir]   = t[ii];
667     x[ir+1] = t[ii+1];
668     x[ir+2] = t[ii+2];
669     ii += 3;
670   }
671 
672   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
673   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
674   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
675   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
676   PLogFlops(2*9*(a->nz) - 3*a->n);
677   PetscFunctionReturn(0);
678 }
679 
680 #undef __FUNC__
681 #define __FUNC__ "MatSolveTrans_SeqBAIJ_4"
682 int MatSolveTrans_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
683 {
684   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
685   IS              iscol=a->col,isrow=a->row;
686   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
687   int             *diag = a->diag,ii,ic,ir,oidx;
688   MatScalar       *aa=a->a,*v;
689   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
690   Scalar          *x,*b,*t;
691 
692   PetscFunctionBegin;
693   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
694   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
695   t  = a->solve_work;
696 
697   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
698   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
699 
700   /* copy the b into temp work space according to permutation */
701   ii = 0;
702   for ( i=0; i<n; i++ ) {
703     ic      = 4*c[i];
704     t[ii]   = b[ic];
705     t[ii+1] = b[ic+1];
706     t[ii+2] = b[ic+2];
707     t[ii+3] = b[ic+3];
708     ii += 4;
709   }
710 
711   /* forward solve the U^T */
712   idx = 0;
713   for ( i=0; i<n; i++ ) {
714 
715     v     = aa + 16*diag[i];
716     /* multiply by the inverse of the block diagonal */
717     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
718     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
719     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
720     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
721     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
722     v += 16;
723 
724     vi    = aj + diag[i] + 1;
725     nz    = ai[i+1] - diag[i] - 1;
726     while (nz--) {
727       oidx = 4*(*vi++);
728       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
729       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
730       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
731       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
732       v  += 16;
733     }
734     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
735     idx += 4;
736   }
737   /* backward solve the L^T */
738   for ( i=n-1; i>=0; i-- ){
739     v    = aa + 16*diag[i] - 16;
740     vi   = aj + diag[i] - 1;
741     nz   = diag[i] - ai[i];
742     idt  = 4*i;
743     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
744     while (nz--) {
745       idx   = 4*(*vi--);
746       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750       v -= 16;
751     }
752   }
753 
754   /* copy t into x according to permutation */
755   ii = 0;
756   for ( i=0; i<n; i++ ) {
757     ir      = 4*r[i];
758     x[ir]   = t[ii];
759     x[ir+1] = t[ii+1];
760     x[ir+2] = t[ii+2];
761     x[ir+3] = t[ii+3];
762     ii += 4;
763   }
764 
765   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
766   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
767   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   PLogFlops(2*16*(a->nz) - 4*a->n);
770   PetscFunctionReturn(0);
771 }
772 
773 #undef __FUNC__
774 #define __FUNC__ "MatSolveTrans_SeqBAIJ_5"
775 int MatSolveTrans_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
778   IS              iscol=a->col,isrow=a->row;
779   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
780   int             *diag = a->diag,ii,ic,ir,oidx;
781   MatScalar       *aa=a->a,*v;
782   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
783   Scalar          *x,*b,*t;
784 
785   PetscFunctionBegin;
786   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
788   t  = a->solve_work;
789 
790   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
791   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
792 
793   /* copy the b into temp work space according to permutation */
794   ii = 0;
795   for ( i=0; i<n; i++ ) {
796     ic      = 5*c[i];
797     t[ii]   = b[ic];
798     t[ii+1] = b[ic+1];
799     t[ii+2] = b[ic+2];
800     t[ii+3] = b[ic+3];
801     t[ii+4] = b[ic+4];
802     ii += 5;
803   }
804 
805   /* forward solve the U^T */
806   idx = 0;
807   for ( i=0; i<n; i++ ) {
808 
809     v     = aa + 25*diag[i];
810     /* multiply by the inverse of the block diagonal */
811     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
812     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
813     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
814     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
815     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
816     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
817     v += 25;
818 
819     vi    = aj + diag[i] + 1;
820     nz    = ai[i+1] - diag[i] - 1;
821     while (nz--) {
822       oidx = 5*(*vi++);
823       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
824       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
825       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
826       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
827       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
828       v  += 25;
829     }
830     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
831     idx += 5;
832   }
833   /* backward solve the L^T */
834   for ( i=n-1; i>=0; i-- ){
835     v    = aa + 25*diag[i] - 25;
836     vi   = aj + diag[i] - 1;
837     nz   = diag[i] - ai[i];
838     idt  = 5*i;
839     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
840     while (nz--) {
841       idx   = 5*(*vi--);
842       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847       v -= 25;
848     }
849   }
850 
851   /* copy t into x according to permutation */
852   ii = 0;
853   for ( i=0; i<n; i++ ) {
854     ir      = 5*r[i];
855     x[ir]   = t[ii];
856     x[ir+1] = t[ii+1];
857     x[ir+2] = t[ii+2];
858     x[ir+3] = t[ii+3];
859     x[ir+4] = t[ii+4];
860     ii += 5;
861   }
862 
863   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
864   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
865   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
866   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
867   PLogFlops(2*25*(a->nz) - 5*a->n);
868   PetscFunctionReturn(0);
869 }
870 
871 #undef __FUNC__
872 #define __FUNC__ "MatSolveTrans_SeqBAIJ_6"
873 int MatSolveTrans_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
874 {
875   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
876   IS              iscol=a->col,isrow=a->row;
877   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
878   int             *diag = a->diag,ii,ic,ir,oidx;
879   MatScalar       *aa=a->a,*v;
880   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
881   Scalar          *x,*b,*t;
882 
883   PetscFunctionBegin;
884   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
885   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
886   t  = a->solve_work;
887 
888   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
889   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
890 
891   /* copy the b into temp work space according to permutation */
892   ii = 0;
893   for ( i=0; i<n; i++ ) {
894     ic      = 6*c[i];
895     t[ii]   = b[ic];
896     t[ii+1] = b[ic+1];
897     t[ii+2] = b[ic+2];
898     t[ii+3] = b[ic+3];
899     t[ii+4] = b[ic+4];
900     t[ii+5] = b[ic+5];
901     ii += 6;
902   }
903 
904   /* forward solve the U^T */
905   idx = 0;
906   for ( i=0; i<n; i++ ) {
907 
908     v     = aa + 36*diag[i];
909     /* multiply by the inverse of the block diagonal */
910     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
911     x6    = t[5+idx];
912     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
913     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
914     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
915     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
916     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
917     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
918     v += 36;
919 
920     vi    = aj + diag[i] + 1;
921     nz    = ai[i+1] - diag[i] - 1;
922     while (nz--) {
923       oidx = 6*(*vi++);
924       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
925       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
926       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
927       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
928       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
929       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
930       v  += 36;
931     }
932     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
933     t[5+idx] = s6;
934     idx += 6;
935   }
936   /* backward solve the L^T */
937   for ( i=n-1; i>=0; i-- ){
938     v    = aa + 36*diag[i] - 36;
939     vi   = aj + diag[i] - 1;
940     nz   = diag[i] - ai[i];
941     idt  = 6*i;
942     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
943     s6 = t[5+idt];
944     while (nz--) {
945       idx   = 6*(*vi--);
946       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
947       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
948       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
949       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
950       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
951       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
952       v -= 36;
953     }
954   }
955 
956   /* copy t into x according to permutation */
957   ii = 0;
958   for ( i=0; i<n; i++ ) {
959     ir      = 6*r[i];
960     x[ir]   = t[ii];
961     x[ir+1] = t[ii+1];
962     x[ir+2] = t[ii+2];
963     x[ir+3] = t[ii+3];
964     x[ir+4] = t[ii+4];
965     x[ir+5] = t[ii+5];
966     ii += 6;
967   }
968 
969   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
970   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
971   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
972   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
973   PLogFlops(2*36*(a->nz) - 6*a->n);
974   PetscFunctionReturn(0);
975 }
976 
977 #undef __FUNC__
978 #define __FUNC__ "MatSolveTrans_SeqBAIJ_7"
979 int MatSolveTrans_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
980 {
981   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
982   IS              iscol=a->col,isrow=a->row;
983   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
984   int             *diag = a->diag,ii,ic,ir,oidx;
985   MatScalar       *aa=a->a,*v;
986   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
987   Scalar          *x,*b,*t;
988 
989   PetscFunctionBegin;
990   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
991   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
992   t  = a->solve_work;
993 
994   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
995   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
996 
997   /* copy the b into temp work space according to permutation */
998   ii = 0;
999   for ( i=0; i<n; i++ ) {
1000     ic      = 7*c[i];
1001     t[ii]   = b[ic];
1002     t[ii+1] = b[ic+1];
1003     t[ii+2] = b[ic+2];
1004     t[ii+3] = b[ic+3];
1005     t[ii+4] = b[ic+4];
1006     t[ii+5] = b[ic+5];
1007     t[ii+6] = b[ic+6];
1008     ii += 7;
1009   }
1010 
1011   /* forward solve the U^T */
1012   idx = 0;
1013   for ( i=0; i<n; i++ ) {
1014 
1015     v     = aa + 49*diag[i];
1016     /* multiply by the inverse of the block diagonal */
1017     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1018     x6    = t[5+idx]; x7 = t[6+idx];
1019     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1020     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1021     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1022     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1023     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1024     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1025     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1026     v += 49;
1027 
1028     vi    = aj + diag[i] + 1;
1029     nz    = ai[i+1] - diag[i] - 1;
1030     while (nz--) {
1031       oidx = 7*(*vi++);
1032       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1033       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1034       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1035       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1036       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1037       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1038       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1039       v  += 49;
1040     }
1041     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1042     t[5+idx] = s6;t[6+idx] = s7;
1043     idx += 7;
1044   }
1045   /* backward solve the L^T */
1046   for ( i=n-1; i>=0; i-- ){
1047     v    = aa + 49*diag[i] - 49;
1048     vi   = aj + diag[i] - 1;
1049     nz   = diag[i] - ai[i];
1050     idt  = 7*i;
1051     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1052     s6 = t[5+idt];s7 = t[6+idt];
1053     while (nz--) {
1054       idx   = 7*(*vi--);
1055       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1056       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1057       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1058       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1059       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1060       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1061       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1062       v -= 49;
1063     }
1064   }
1065 
1066   /* copy t into x according to permutation */
1067   ii = 0;
1068   for ( i=0; i<n; i++ ) {
1069     ir      = 7*r[i];
1070     x[ir]   = t[ii];
1071     x[ir+1] = t[ii+1];
1072     x[ir+2] = t[ii+2];
1073     x[ir+3] = t[ii+3];
1074     x[ir+4] = t[ii+4];
1075     x[ir+5] = t[ii+5];
1076     x[ir+6] = t[ii+6];
1077     ii += 7;
1078   }
1079 
1080   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1081   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1082   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1083   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1084   PLogFlops(2*49*(a->nz) - 7*a->n);
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 /* ----------------------------------------------------------- */
1089 #undef __FUNC__
1090 #define __FUNC__ "MatSolve_SeqBAIJ_N"
1091 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1092 {
1093   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1094   IS              iscol=a->col,isrow=a->row;
1095   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1096   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1097   MatScalar       *aa=a->a,*v;
1098   Scalar          *x,*b,*s,*t,*ls;
1099 
1100   PetscFunctionBegin;
1101   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1102   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1103   t  = a->solve_work;
1104 
1105   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1106   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1107 
1108   /* forward solve the lower triangular */
1109   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1110   for ( i=1; i<n; i++ ) {
1111     v   = aa + bs2*ai[i];
1112     vi  = aj + ai[i];
1113     nz  = a->diag[i] - ai[i];
1114     s = t + bs*i;
1115     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1116     while (nz--) {
1117       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1118       v += bs2;
1119     }
1120   }
1121   /* backward solve the upper triangular */
1122   ls = a->solve_work + a->n;
1123   for ( i=n-1; i>=0; i-- ){
1124     v   = aa + bs2*(a->diag[i] + 1);
1125     vi  = aj + a->diag[i] + 1;
1126     nz  = ai[i+1] - a->diag[i] - 1;
1127     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1128     while (nz--) {
1129       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1130       v += bs2;
1131     }
1132     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1133     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1134   }
1135 
1136   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1137   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1138   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1139   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1140   PLogFlops(2*(a->bs2)*(a->nz) - a->bs*a->n);
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 #undef __FUNC__
1145 #define __FUNC__ "MatSolve_SeqBAIJ_7"
1146 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1147 {
1148   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1149   IS              iscol=a->col,isrow=a->row;
1150   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1151   int             *diag = a->diag;
1152   MatScalar       *aa=a->a,*v;
1153   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1154   Scalar          *x,*b,*t;
1155 
1156   PetscFunctionBegin;
1157   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1158   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1159   t  = a->solve_work;
1160 
1161   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1162   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1163 
1164   /* forward solve the lower triangular */
1165   idx    = 7*(*r++);
1166   t[0] = b[idx];   t[1] = b[1+idx];
1167   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1168   t[5] = b[5+idx]; t[6] = b[6+idx];
1169 
1170   for ( i=1; i<n; i++ ) {
1171     v     = aa + 49*ai[i];
1172     vi    = aj + ai[i];
1173     nz    = diag[i] - ai[i];
1174     idx   = 7*(*r++);
1175     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1176     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1177     while (nz--) {
1178       idx   = 7*(*vi++);
1179       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1180       x4    = t[3+idx];x5 = t[4+idx];
1181       x6    = t[5+idx];x7 = t[6+idx];
1182       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1183       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1184       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1185       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1186       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1187       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1188       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1189       v += 49;
1190     }
1191     idx = 7*i;
1192     t[idx]   = s1;t[1+idx] = s2;
1193     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1194     t[5+idx] = s6;t[6+idx] = s7;
1195   }
1196   /* backward solve the upper triangular */
1197   for ( i=n-1; i>=0; i-- ){
1198     v    = aa + 49*diag[i] + 49;
1199     vi   = aj + diag[i] + 1;
1200     nz   = ai[i+1] - diag[i] - 1;
1201     idt  = 7*i;
1202     s1 = t[idt];  s2 = t[1+idt];
1203     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1204     s6 = t[5+idt];s7 = t[6+idt];
1205     while (nz--) {
1206       idx   = 7*(*vi++);
1207       x1    = t[idx];   x2 = t[1+idx];
1208       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1209       x6    = t[5+idx]; x7 = t[6+idx];
1210       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1211       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1212       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1213       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1214       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1215       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1216       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1217       v += 49;
1218     }
1219     idc = 7*(*c--);
1220     v   = aa + 49*diag[i];
1221     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1222                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1223     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1224                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1225     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1226                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1227     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1228                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1229     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1230                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1231     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1232                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1233     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1234                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1235   }
1236 
1237   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1238   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1239   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1241   PLogFlops(2*49*(a->nz) - 7*a->n);
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 #undef __FUNC__
1246 #define __FUNC__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1247 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1248 {
1249   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1250   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1251   int             ierr,*diag = a->diag,jdx;
1252   MatScalar       *aa=a->a,*v;
1253   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1254 
1255   PetscFunctionBegin;
1256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1258   /* forward solve the lower triangular */
1259   idx    = 0;
1260   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1261   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1262   x[6] = b[6+idx];
1263   for ( i=1; i<n; i++ ) {
1264     v     =  aa + 49*ai[i];
1265     vi    =  aj + ai[i];
1266     nz    =  diag[i] - ai[i];
1267     idx   =  7*i;
1268     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1269     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1270     s7  =  b[6+idx];
1271     while (nz--) {
1272       jdx   = 7*(*vi++);
1273       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1274       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1275       x7    = x[6+jdx];
1276       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1277       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1278       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1279       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1280       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1281       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1282       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1283       v += 49;
1284      }
1285     x[idx]   = s1;
1286     x[1+idx] = s2;
1287     x[2+idx] = s3;
1288     x[3+idx] = s4;
1289     x[4+idx] = s5;
1290     x[5+idx] = s6;
1291     x[6+idx] = s7;
1292   }
1293   /* backward solve the upper triangular */
1294   for ( i=n-1; i>=0; i-- ){
1295     v    = aa + 49*diag[i] + 49;
1296     vi   = aj + diag[i] + 1;
1297     nz   = ai[i+1] - diag[i] - 1;
1298     idt  = 7*i;
1299     s1 = x[idt];   s2 = x[1+idt];
1300     s3 = x[2+idt]; s4 = x[3+idt];
1301     s5 = x[4+idt]; s6 = x[5+idt];
1302     s7 = x[6+idt];
1303     while (nz--) {
1304       idx   = 7*(*vi++);
1305       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1306       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1307       x7    = x[6+idx];
1308       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1309       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1310       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1311       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1312       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1313       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1314       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1315       v += 49;
1316     }
1317     v        = aa + 49*diag[i];
1318     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1319                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1320     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1321                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1322     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1323                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1324     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1325                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1326     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1327                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1328     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1329                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1330     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1331                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1332   }
1333 
1334   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1336   PLogFlops(2*36*(a->nz) - 6*a->n);
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 #undef __FUNC__
1341 #define __FUNC__ "MatSolve_SeqBAIJ_6"
1342 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1343 {
1344   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1345   IS              iscol=a->col,isrow=a->row;
1346   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1347   int             *diag = a->diag;
1348   MatScalar       *aa=a->a,*v;
1349   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1350 
1351   PetscFunctionBegin;
1352   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354   t  = a->solve_work;
1355 
1356   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1358 
1359   /* forward solve the lower triangular */
1360   idx    = 6*(*r++);
1361   t[0] = b[idx];   t[1] = b[1+idx];
1362   t[2] = b[2+idx]; t[3] = b[3+idx];
1363   t[4] = b[4+idx]; t[5] = b[5+idx];
1364   for ( i=1; i<n; i++ ) {
1365     v     = aa + 36*ai[i];
1366     vi    = aj + ai[i];
1367     nz    = diag[i] - ai[i];
1368     idx   = 6*(*r++);
1369     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1370     s5  = b[4+idx]; s6 = b[5+idx];
1371     while (nz--) {
1372       idx   = 6*(*vi++);
1373       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1374       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1375       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1376       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1377       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1378       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1379       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1380       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1381       v += 36;
1382     }
1383     idx = 6*i;
1384     t[idx]   = s1;t[1+idx] = s2;
1385     t[2+idx] = s3;t[3+idx] = s4;
1386     t[4+idx] = s5;t[5+idx] = s6;
1387   }
1388   /* backward solve the upper triangular */
1389   for ( i=n-1; i>=0; i-- ){
1390     v    = aa + 36*diag[i] + 36;
1391     vi   = aj + diag[i] + 1;
1392     nz   = ai[i+1] - diag[i] - 1;
1393     idt  = 6*i;
1394     s1 = t[idt];  s2 = t[1+idt];
1395     s3 = t[2+idt];s4 = t[3+idt];
1396     s5 = t[4+idt];s6 = t[5+idt];
1397     while (nz--) {
1398       idx   = 6*(*vi++);
1399       x1    = t[idx];   x2 = t[1+idx];
1400       x3    = t[2+idx]; x4 = t[3+idx];
1401       x5    = t[4+idx]; x6 = t[5+idx];
1402       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1403       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1404       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1405       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1406       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1407       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1408       v += 36;
1409     }
1410     idc = 6*(*c--);
1411     v   = aa + 36*diag[i];
1412     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1413                                  v[18]*s4+v[24]*s5+v[30]*s6;
1414     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1415                                  v[19]*s4+v[25]*s5+v[31]*s6;
1416     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1417                                  v[20]*s4+v[26]*s5+v[32]*s6;
1418     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1419                                  v[21]*s4+v[27]*s5+v[33]*s6;
1420     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1421                                  v[22]*s4+v[28]*s5+v[34]*s6;
1422     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1423                                  v[23]*s4+v[29]*s5+v[35]*s6;
1424   }
1425 
1426   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1427   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1428   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1429   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1430   PLogFlops(2*36*(a->nz) - 6*a->n);
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 #undef __FUNC__
1435 #define __FUNC__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1436 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1437 {
1438   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1439   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1440   int             ierr,*diag = a->diag,jdx;
1441   MatScalar       *aa=a->a,*v;
1442   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1443 
1444   PetscFunctionBegin;
1445   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1446   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1447   /* forward solve the lower triangular */
1448   idx    = 0;
1449   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1450   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1451   for ( i=1; i<n; i++ ) {
1452     v     =  aa + 36*ai[i];
1453     vi    =  aj + ai[i];
1454     nz    =  diag[i] - ai[i];
1455     idx   =  6*i;
1456     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1457     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1458     while (nz--) {
1459       jdx   = 6*(*vi++);
1460       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1461       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1462       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1463       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1464       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1465       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1466       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1467       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1468       v += 36;
1469      }
1470     x[idx]   = s1;
1471     x[1+idx] = s2;
1472     x[2+idx] = s3;
1473     x[3+idx] = s4;
1474     x[4+idx] = s5;
1475     x[5+idx] = s6;
1476   }
1477   /* backward solve the upper triangular */
1478   for ( i=n-1; i>=0; i-- ){
1479     v    = aa + 36*diag[i] + 36;
1480     vi   = aj + diag[i] + 1;
1481     nz   = ai[i+1] - diag[i] - 1;
1482     idt  = 6*i;
1483     s1 = x[idt];   s2 = x[1+idt];
1484     s3 = x[2+idt]; s4 = x[3+idt];
1485     s5 = x[4+idt]; s6 = x[5+idt];
1486     while (nz--) {
1487       idx   = 6*(*vi++);
1488       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1489       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1490       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496       v += 36;
1497     }
1498     v        = aa + 36*diag[i];
1499     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1500     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1501     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1502     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1503     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1504     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1505   }
1506 
1507   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1508   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1509   PLogFlops(2*36*(a->nz) - 6*a->n);
1510   PetscFunctionReturn(0);
1511 }
1512 
1513 #undef __FUNC__
1514 #define __FUNC__ "MatSolve_SeqBAIJ_5"
1515 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1516 {
1517   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1518   IS              iscol=a->col,isrow=a->row;
1519   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1520   int             *diag = a->diag;
1521   MatScalar       *aa=a->a,*v;
1522   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1523 
1524   PetscFunctionBegin;
1525   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1526   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1527   t  = a->solve_work;
1528 
1529   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1530   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1531 
1532   /* forward solve the lower triangular */
1533   idx    = 5*(*r++);
1534   t[0] = b[idx];   t[1] = b[1+idx];
1535   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1536   for ( i=1; i<n; i++ ) {
1537     v     = aa + 25*ai[i];
1538     vi    = aj + ai[i];
1539     nz    = diag[i] - ai[i];
1540     idx   = 5*(*r++);
1541     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1542     s5  = b[4+idx];
1543     while (nz--) {
1544       idx   = 5*(*vi++);
1545       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1546       x4    = t[3+idx];x5 = t[4+idx];
1547       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1548       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1549       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1550       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1551       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1552       v += 25;
1553     }
1554     idx = 5*i;
1555     t[idx]   = s1;t[1+idx] = s2;
1556     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1557   }
1558   /* backward solve the upper triangular */
1559   for ( i=n-1; i>=0; i-- ){
1560     v    = aa + 25*diag[i] + 25;
1561     vi   = aj + diag[i] + 1;
1562     nz   = ai[i+1] - diag[i] - 1;
1563     idt  = 5*i;
1564     s1 = t[idt];  s2 = t[1+idt];
1565     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1566     while (nz--) {
1567       idx   = 5*(*vi++);
1568       x1    = t[idx];   x2 = t[1+idx];
1569       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1570       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1571       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1572       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1573       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1574       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1575       v += 25;
1576     }
1577     idc = 5*(*c--);
1578     v   = aa + 25*diag[i];
1579     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1580                                  v[15]*s4+v[20]*s5;
1581     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1582                                  v[16]*s4+v[21]*s5;
1583     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1584                                  v[17]*s4+v[22]*s5;
1585     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1586                                  v[18]*s4+v[23]*s5;
1587     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1588                                  v[19]*s4+v[24]*s5;
1589   }
1590 
1591   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1592   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1595   PLogFlops(2*25*(a->nz) - 5*a->n);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 #undef __FUNC__
1600 #define __FUNC__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1601 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1602 {
1603   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1604   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1605   int             ierr,*diag = a->diag,jdx;
1606   MatScalar       *aa=a->a,*v;
1607   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;;
1608 
1609   PetscFunctionBegin;
1610   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1611   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1612   /* forward solve the lower triangular */
1613   idx    = 0;
1614   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1615   for ( i=1; i<n; i++ ) {
1616     v     =  aa + 25*ai[i];
1617     vi    =  aj + ai[i];
1618     nz    =  diag[i] - ai[i];
1619     idx   =  5*i;
1620     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1621     while (nz--) {
1622       jdx   = 5*(*vi++);
1623       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1624       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1625       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1626       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1627       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1628       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1629       v    += 25;
1630     }
1631     x[idx]   = s1;
1632     x[1+idx] = s2;
1633     x[2+idx] = s3;
1634     x[3+idx] = s4;
1635     x[4+idx] = s5;
1636   }
1637   /* backward solve the upper triangular */
1638   for ( i=n-1; i>=0; i-- ){
1639     v    = aa + 25*diag[i] + 25;
1640     vi   = aj + diag[i] + 1;
1641     nz   = ai[i+1] - diag[i] - 1;
1642     idt  = 5*i;
1643     s1 = x[idt];  s2 = x[1+idt];
1644     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1645     while (nz--) {
1646       idx   = 5*(*vi++);
1647       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1648       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1649       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1650       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1651       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1652       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1653       v    += 25;
1654     }
1655     v        = aa + 25*diag[i];
1656     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1657     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1658     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1659     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1660     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1661   }
1662 
1663   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1664   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1665   PLogFlops(2*25*(a->nz) - 5*a->n);
1666   PetscFunctionReturn(0);
1667 }
1668 
1669 #undef __FUNC__
1670 #define __FUNC__ "MatSolve_SeqBAIJ_4"
1671 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1672 {
1673   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1674   IS              iscol=a->col,isrow=a->row;
1675   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1676   int             *diag = a->diag;
1677   MatScalar       *aa=a->a,*v;
1678   Scalar          *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1679 
1680   PetscFunctionBegin;
1681   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1682   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1683   t  = a->solve_work;
1684 
1685   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1686   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1687 
1688   /* forward solve the lower triangular */
1689   idx    = 4*(*r++);
1690   t[0] = b[idx];   t[1] = b[1+idx];
1691   t[2] = b[2+idx]; t[3] = b[3+idx];
1692   for ( i=1; i<n; i++ ) {
1693     v     = aa + 16*ai[i];
1694     vi    = aj + ai[i];
1695     nz    = diag[i] - ai[i];
1696     idx   = 4*(*r++);
1697     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1698     while (nz--) {
1699       idx   = 4*(*vi++);
1700       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1701       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1702       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1703       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1704       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1705       v    += 16;
1706     }
1707     idx        = 4*i;
1708     t[idx]   = s1;t[1+idx] = s2;
1709     t[2+idx] = s3;t[3+idx] = s4;
1710   }
1711   /* backward solve the upper triangular */
1712   for ( i=n-1; i>=0; i-- ){
1713     v    = aa + 16*diag[i] + 16;
1714     vi   = aj + diag[i] + 1;
1715     nz   = ai[i+1] - diag[i] - 1;
1716     idt  = 4*i;
1717     s1 = t[idt];  s2 = t[1+idt];
1718     s3 = t[2+idt];s4 = t[3+idt];
1719     while (nz--) {
1720       idx   = 4*(*vi++);
1721       x1    = t[idx];   x2 = t[1+idx];
1722       x3    = t[2+idx]; x4 = t[3+idx];
1723       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1724       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1725       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1726       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1727       v += 16;
1728     }
1729     idc      = 4*(*c--);
1730     v        = aa + 16*diag[i];
1731     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1732     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1733     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1734     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1735   }
1736 
1737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741   PLogFlops(2*16*(a->nz) - 4*a->n);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 
1746 /*
1747       Special case where the matrix was ILU(0) factored in the natural
1748    ordering. This eliminates the need for the column and row permutation.
1749 */
1750 #undef __FUNC__
1751 #define __FUNC__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
1752 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1753 {
1754   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1755   int             n=a->mbs,*ai=a->i,*aj=a->j;
1756   int             ierr,*diag = a->diag;
1757   MatScalar       *aa=a->a;
1758   Scalar          *x,*b;
1759 
1760   PetscFunctionBegin;
1761   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763 
1764 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1765   {
1766     static Scalar w[2000]; /* very BAD need to fix */
1767     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1768   }
1769 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1770   {
1771     static Scalar w[2000]; /* very BAD need to fix */
1772     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1773   }
1774 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1775   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1776 #else
1777   {
1778     Scalar    s1,s2,s3,s4,x1,x2,x3,x4;
1779     MatScalar *v;
1780     int       jdx,idt,idx,nz,*vi,i;
1781 
1782   /* forward solve the lower triangular */
1783   idx    = 0;
1784   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1785   for ( i=1; i<n; i++ ) {
1786     v     =  aa      + 16*ai[i];
1787     vi    =  aj      + ai[i];
1788     nz    =  diag[i] - ai[i];
1789     idx   +=  4;
1790     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1791     while (nz--) {
1792       jdx   = 4*(*vi++);
1793       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1794       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1795       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1796       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1797       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1798       v    += 16;
1799     }
1800     x[idx]   = s1;
1801     x[1+idx] = s2;
1802     x[2+idx] = s3;
1803     x[3+idx] = s4;
1804   }
1805   /* backward solve the upper triangular */
1806   for ( i=n-1; i>=0; i-- ){
1807     v    = aa + 16*diag[i] + 16;
1808     vi   = aj + diag[i] + 1;
1809     nz   = ai[i+1] - diag[i] - 1;
1810     idt  = 4*i;
1811     s1 = x[idt];  s2 = x[1+idt];
1812     s3 = x[2+idt];s4 = x[3+idt];
1813     while (nz--) {
1814       idx   = 4*(*vi++);
1815       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1816       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1817       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1818       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1819       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1820       v    += 16;
1821     }
1822     v        = aa + 16*diag[i];
1823     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
1824     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
1825     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1826     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1827   }
1828   }
1829 #endif
1830 
1831   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1832   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1833   PLogFlops(2*16*(a->nz) - 4*a->n);
1834   PetscFunctionReturn(0);
1835 }
1836 
1837 #undef __FUNC__
1838 #define __FUNC__ "MatSolve_SeqBAIJ_3"
1839 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1840 {
1841   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1842   IS              iscol=a->col,isrow=a->row;
1843   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1844   int             *diag = a->diag;
1845   MatScalar       *aa=a->a,*v;
1846   Scalar          *x,*b,s1,s2,s3,x1,x2,x3,*t;
1847 
1848   PetscFunctionBegin;
1849   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1850   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1851   t  = a->solve_work;
1852 
1853   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1854   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1855 
1856   /* forward solve the lower triangular */
1857   idx    = 3*(*r++);
1858   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1859   for ( i=1; i<n; i++ ) {
1860     v     = aa + 9*ai[i];
1861     vi    = aj + ai[i];
1862     nz    = diag[i] - ai[i];
1863     idx   = 3*(*r++);
1864     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1865     while (nz--) {
1866       idx   = 3*(*vi++);
1867       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1868       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1869       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1870       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1871       v += 9;
1872     }
1873     idx = 3*i;
1874     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1875   }
1876   /* backward solve the upper triangular */
1877   for ( i=n-1; i>=0; i-- ){
1878     v    = aa + 9*diag[i] + 9;
1879     vi   = aj + diag[i] + 1;
1880     nz   = ai[i+1] - diag[i] - 1;
1881     idt  = 3*i;
1882     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1883     while (nz--) {
1884       idx   = 3*(*vi++);
1885       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1886       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1887       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1888       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1889       v += 9;
1890     }
1891     idc = 3*(*c--);
1892     v   = aa + 9*diag[i];
1893     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1894     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1895     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1896   }
1897   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1898   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1899   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1900   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1901   PLogFlops(2*9*(a->nz) - 3*a->n);
1902   PetscFunctionReturn(0);
1903 }
1904 
1905 /*
1906       Special case where the matrix was ILU(0) factored in the natural
1907    ordering. This eliminates the need for the column and row permutation.
1908 */
1909 #undef __FUNC__
1910 #define __FUNC__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
1911 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1912 {
1913   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1914   int             n=a->mbs,*ai=a->i,*aj=a->j;
1915   int             ierr,*diag = a->diag;
1916   MatScalar       *aa=a->a, *v;
1917   Scalar          *x,*b,s1,s2,s3,x1,x2,x3;
1918   int             jdx,idt,idx,nz,*vi,i;
1919 
1920   PetscFunctionBegin;
1921   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1922   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1923 
1924 
1925   /* forward solve the lower triangular */
1926   idx    = 0;
1927   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
1928   for ( i=1; i<n; i++ ) {
1929     v     =  aa      + 9*ai[i];
1930     vi    =  aj      + ai[i];
1931     nz    =  diag[i] - ai[i];
1932     idx   +=  3;
1933     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
1934     while (nz--) {
1935       jdx   = 3*(*vi++);
1936       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1937       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1938       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1939       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1940       v    += 9;
1941     }
1942     x[idx]   = s1;
1943     x[1+idx] = s2;
1944     x[2+idx] = s3;
1945   }
1946   /* backward solve the upper triangular */
1947   for ( i=n-1; i>=0; i-- ){
1948     v    = aa + 9*diag[i] + 9;
1949     vi   = aj + diag[i] + 1;
1950     nz   = ai[i+1] - diag[i] - 1;
1951     idt  = 3*i;
1952     s1 = x[idt];  s2 = x[1+idt];
1953     s3 = x[2+idt];
1954     while (nz--) {
1955       idx   = 3*(*vi++);
1956       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
1957       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1958       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1959       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1960       v    += 9;
1961     }
1962     v        = aa +  9*diag[i];
1963     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1964     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1965     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1966   }
1967 
1968   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1969   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1970   PLogFlops(2*9*(a->nz) - 3*a->n);
1971   PetscFunctionReturn(0);
1972 }
1973 
1974 #undef __FUNC__
1975 #define __FUNC__ "MatSolve_SeqBAIJ_2"
1976 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1977 {
1978   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1979   IS              iscol=a->col,isrow=a->row;
1980   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1981   int             *diag = a->diag;
1982   MatScalar       *aa=a->a,*v;
1983   Scalar          *x,*b,s1,s2,x1,x2,*t;
1984 
1985   PetscFunctionBegin;
1986   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1987   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1988   t  = a->solve_work;
1989 
1990   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1991   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1992 
1993   /* forward solve the lower triangular */
1994   idx    = 2*(*r++);
1995   t[0] = b[idx]; t[1] = b[1+idx];
1996   for ( i=1; i<n; i++ ) {
1997     v     = aa + 4*ai[i];
1998     vi    = aj + ai[i];
1999     nz    = diag[i] - ai[i];
2000     idx   = 2*(*r++);
2001     s1  = b[idx]; s2 = b[1+idx];
2002     while (nz--) {
2003       idx   = 2*(*vi++);
2004       x1    = t[idx]; x2 = t[1+idx];
2005       s1 -= v[0]*x1 + v[2]*x2;
2006       s2 -= v[1]*x1 + v[3]*x2;
2007       v += 4;
2008     }
2009     idx = 2*i;
2010     t[idx] = s1; t[1+idx] = s2;
2011   }
2012   /* backward solve the upper triangular */
2013   for ( i=n-1; i>=0; i-- ){
2014     v    = aa + 4*diag[i] + 4;
2015     vi   = aj + diag[i] + 1;
2016     nz   = ai[i+1] - diag[i] - 1;
2017     idt  = 2*i;
2018     s1 = t[idt]; s2 = t[1+idt];
2019     while (nz--) {
2020       idx   = 2*(*vi++);
2021       x1    = t[idx]; x2 = t[1+idx];
2022       s1 -= v[0]*x1 + v[2]*x2;
2023       s2 -= v[1]*x1 + v[3]*x2;
2024       v += 4;
2025     }
2026     idc = 2*(*c--);
2027     v   = aa + 4*diag[i];
2028     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2029     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2030   }
2031   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2032   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2033   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2034   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2035   PLogFlops(2*4*(a->nz) - 2*a->n);
2036   PetscFunctionReturn(0);
2037 }
2038 
2039 /*
2040       Special case where the matrix was ILU(0) factored in the natural
2041    ordering. This eliminates the need for the column and row permutation.
2042 */
2043 #undef __FUNC__
2044 #define __FUNC__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2045 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2046 {
2047   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2048   int             n=a->mbs,*ai=a->i,*aj=a->j;
2049   int             ierr,*diag = a->diag;
2050   MatScalar       *aa=a->a,*v;
2051   Scalar          *x,*b,s1,s2,x1,x2;
2052   int             jdx,idt,idx,nz,*vi,i;
2053 
2054   PetscFunctionBegin;
2055   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2056   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2057 
2058   /* forward solve the lower triangular */
2059   idx    = 0;
2060   x[0]   = b[0]; x[1] = b[1];
2061   for ( i=1; i<n; i++ ) {
2062     v     =  aa      + 4*ai[i];
2063     vi    =  aj      + ai[i];
2064     nz    =  diag[i] - ai[i];
2065     idx   +=  2;
2066     s1  =  b[idx];s2 = b[1+idx];
2067     while (nz--) {
2068       jdx   = 2*(*vi++);
2069       x1    = x[jdx];x2 = x[1+jdx];
2070       s1 -= v[0]*x1 + v[2]*x2;
2071       s2 -= v[1]*x1 + v[3]*x2;
2072       v    += 4;
2073     }
2074     x[idx]   = s1;
2075     x[1+idx] = s2;
2076   }
2077   /* backward solve the upper triangular */
2078   for ( i=n-1; i>=0; i-- ){
2079     v    = aa + 4*diag[i] + 4;
2080     vi   = aj + diag[i] + 1;
2081     nz   = ai[i+1] - diag[i] - 1;
2082     idt  = 2*i;
2083     s1 = x[idt];  s2 = x[1+idt];
2084     while (nz--) {
2085       idx   = 2*(*vi++);
2086       x1    = x[idx];   x2 = x[1+idx];
2087       s1 -= v[0]*x1 + v[2]*x2;
2088       s2 -= v[1]*x1 + v[3]*x2;
2089       v    += 4;
2090     }
2091     v        = aa +  4*diag[i];
2092     x[idt]   = v[0]*s1 + v[2]*s2;
2093     x[1+idt] = v[1]*s1 + v[3]*s2;
2094   }
2095 
2096   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2097   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2098   PLogFlops(2*4*(a->nz) - 2*a->n);
2099   PetscFunctionReturn(0);
2100 }
2101 
2102 #undef __FUNC__
2103 #define __FUNC__ "MatSolve_SeqBAIJ_1"
2104 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2105 {
2106   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2107   IS              iscol=a->col,isrow=a->row;
2108   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2109   int             *diag = a->diag;
2110   MatScalar       *aa=a->a,*v;
2111   Scalar          *x,*b,s1,*t;
2112 
2113   PetscFunctionBegin;
2114   if (!n) PetscFunctionReturn(0);
2115 
2116   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2117   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2118   t  = a->solve_work;
2119 
2120   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2121   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2122 
2123   /* forward solve the lower triangular */
2124   t[0] = b[*r++];
2125   for ( i=1; i<n; i++ ) {
2126     v     = aa + ai[i];
2127     vi    = aj + ai[i];
2128     nz    = diag[i] - ai[i];
2129     s1  = b[*r++];
2130     while (nz--) {
2131       s1 -= (*v++)*t[*vi++];
2132     }
2133     t[i] = s1;
2134   }
2135   /* backward solve the upper triangular */
2136   for ( i=n-1; i>=0; i-- ){
2137     v    = aa + diag[i] + 1;
2138     vi   = aj + diag[i] + 1;
2139     nz   = ai[i+1] - diag[i] - 1;
2140     s1 = t[i];
2141     while (nz--) {
2142       s1 -= (*v++)*t[*vi++];
2143     }
2144     x[*c--] = t[i] = aa[diag[i]]*s1;
2145   }
2146 
2147   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2148   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2149   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2150   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2151   PLogFlops(2*1*(a->nz) - a->n);
2152   PetscFunctionReturn(0);
2153 }
2154 /*
2155       Special case where the matrix was ILU(0) factored in the natural
2156    ordering. This eliminates the need for the column and row permutation.
2157 */
2158 #undef __FUNC__
2159 #define __FUNC__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2160 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2161 {
2162   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2163   int             n=a->mbs,*ai=a->i,*aj=a->j;
2164   int             ierr,*diag = a->diag;
2165   MatScalar       *aa=a->a;
2166   Scalar          *x,*b;
2167   Scalar          s1,x1;
2168   MatScalar       *v;
2169   int             jdx,idt,idx,nz,*vi,i;
2170 
2171   PetscFunctionBegin;
2172   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2173   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2174 
2175   /* forward solve the lower triangular */
2176   idx    = 0;
2177   x[0]   = b[0];
2178   for ( i=1; i<n; i++ ) {
2179     v     =  aa      + ai[i];
2180     vi    =  aj      + ai[i];
2181     nz    =  diag[i] - ai[i];
2182     idx   +=  1;
2183     s1  =  b[idx];
2184     while (nz--) {
2185       jdx   = *vi++;
2186       x1    = x[jdx];
2187       s1 -= v[0]*x1;
2188       v    += 1;
2189     }
2190     x[idx]   = s1;
2191   }
2192   /* backward solve the upper triangular */
2193   for ( i=n-1; i>=0; i-- ){
2194     v    = aa + diag[i] + 1;
2195     vi   = aj + diag[i] + 1;
2196     nz   = ai[i+1] - diag[i] - 1;
2197     idt  = i;
2198     s1 = x[idt];
2199     while (nz--) {
2200       idx   = *vi++;
2201       x1    = x[idx];
2202       s1 -= v[0]*x1;
2203       v    += 1;
2204     }
2205     v        = aa +  diag[i];
2206     x[idt]   = v[0]*s1;
2207   }
2208   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2209   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2210   PLogFlops(2*(a->nz) - a->n);
2211   PetscFunctionReturn(0);
2212 }
2213 
2214 /* ----------------------------------------------------------------*/
2215 /*
2216      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2217    except that the data structure of Mat_SeqAIJ is slightly different.
2218    Not a good example of code reuse.
2219 */
2220 extern int MatMissingDiag_SeqBAIJ(Mat);
2221 
2222 #undef __FUNC__
2223 #define __FUNC__ "MatILUFactorSymbolic_SeqBAIJ"
2224 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2225 {
2226   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
2227   IS          isicol;
2228   int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
2229   int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
2230   int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0, dcount = 0;
2231   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2, levels, diagonal_fill;
2232   PetscTruth  col_identity, row_identity;
2233   double      f;
2234 
2235   PetscFunctionBegin;
2236   if (info) {
2237     f             = info->fill;
2238     levels        = (int) info->levels;
2239     diagonal_fill = (int) info->diagonal_fill;
2240   } else {
2241     f             = 1.0;
2242     levels        = 0;
2243     diagonal_fill = 0;
2244   }
2245   ierr = ISInvertPermutation(iscol,&isicol);CHKERRQ(ierr);
2246 
2247   /* special case that simply copies fill pattern */
2248   PetscValidHeaderSpecific(isrow,IS_COOKIE);
2249   PetscValidHeaderSpecific(iscol,IS_COOKIE);
2250   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2251   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2252   if (levels == 0 && row_identity && col_identity) {
2253     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2254     (*fact)->factor = FACTOR_LU;
2255     b               = (Mat_SeqBAIJ *) (*fact)->data;
2256     if (!b->diag) {
2257       ierr = MatMarkDiag_SeqBAIJ(*fact);CHKERRQ(ierr);
2258     }
2259     ierr = MatMissingDiag_SeqBAIJ(*fact);CHKERRQ(ierr);
2260     b->row        = isrow;
2261     b->col        = iscol;
2262     b->icol       = isicol;
2263     b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
2264    /*
2265         Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2266         for ILU(0) factorization with natural ordering
2267    */
2268     switch (b->bs) {
2269     case 2:
2270       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2271       (*fact)->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2272       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
2273       break;
2274     case 3:
2275       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2276       (*fact)->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2277       PLogInfo(A,"UMatILUFactorSymbolic_SeqBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
2278       break;
2279     case 4:
2280       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2281       (*fact)->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
2282       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
2283       break;
2284     case 5:
2285       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2286       (*fact)->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
2287       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
2288       break;
2289     case 6:
2290       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2291       (*fact)->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
2292       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
2293       break;
2294     case 7:
2295       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2296       (*fact)->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
2297       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
2298     break;
2299   }
2300     PetscFunctionReturn(0);
2301   }
2302 
2303   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
2304   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
2305 
2306   /* get new row pointers */
2307   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) );CHKPTRQ(ainew);
2308   ainew[0] = 0;
2309   /* don't know how many column pointers are needed so estimate */
2310   jmax = (int) (f*ai[n] + 1);
2311   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) );CHKPTRQ(ajnew);
2312   /* ajfill is level of fill for each fill entry */
2313   ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) );CHKPTRQ(ajfill);
2314   /* fill is a linked list of nonzeros in active row */
2315   fill = (int *) PetscMalloc( (n+1)*sizeof(int));CHKPTRQ(fill);
2316   /* im is level for each filled value */
2317   im = (int *) PetscMalloc( (n+1)*sizeof(int));CHKPTRQ(im);
2318   /* dloc is location of diagonal in factor */
2319   dloc = (int *) PetscMalloc( (n+1)*sizeof(int));CHKPTRQ(dloc);
2320   dloc[0]  = 0;
2321   for ( prow=0; prow<n; prow++ ) {
2322 
2323     /* copy prow into linked list */
2324     nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2325     if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,1,"Empty row in matrix");
2326     xi         = aj + ai[r[prow]];
2327     fill[n]    = n;
2328     fill[prow] = -1; /* marker for diagonal entry */
2329     while (nz--) {
2330       fm  = n;
2331       idx = ic[*xi++];
2332       do {
2333         m  = fm;
2334         fm = fill[m];
2335       } while (fm < idx);
2336       fill[m]   = idx;
2337       fill[idx] = fm;
2338       im[idx]   = 0;
2339     }
2340 
2341     /* make sure diagonal entry is included */
2342     if (diagonal_fill && fill[prow] == -1) {
2343       fm = n;
2344       while (fill[fm] < prow) fm = fill[fm];
2345       fill[prow] = fill[fm];  /* insert diagonal into linked list */
2346       fill[fm]   = prow;
2347       im[prow]   = 0;
2348       nzf++;
2349       dcount++;
2350     }
2351 
2352     nzi = 0;
2353     row = fill[n];
2354     while ( row < prow ) {
2355       incrlev = im[row] + 1;
2356       nz      = dloc[row];
2357       xi      = ajnew  + ainew[row] + nz + 1;
2358       flev    = ajfill + ainew[row] + nz + 1;
2359       nnz     = ainew[row+1] - ainew[row] - nz - 1;
2360       fm      = row;
2361       while (nnz-- > 0) {
2362         idx = *xi++;
2363         if (*flev + incrlev > levels) {
2364           flev++;
2365           continue;
2366         }
2367         do {
2368           m  = fm;
2369           fm = fill[m];
2370         } while (fm < idx);
2371         if (fm != idx) {
2372           im[idx]   = *flev + incrlev;
2373           fill[m]   = idx;
2374           fill[idx] = fm;
2375           fm        = idx;
2376           nzf++;
2377         } else {
2378           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2379         }
2380         flev++;
2381       }
2382       row = fill[row];
2383       nzi++;
2384     }
2385     /* copy new filled row into permanent storage */
2386     ainew[prow+1] = ainew[prow] + nzf;
2387     if (ainew[prow+1] > jmax) {
2388 
2389       /* estimate how much additional space we will need */
2390       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2391       /* just double the memory each time */
2392       int maxadd = jmax;
2393       /* maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n); */
2394       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2395       jmax += maxadd;
2396 
2397       /* allocate a longer ajnew and ajfill */
2398       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
2399       ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2400       ierr = PetscFree(ajnew);CHKERRQ(ierr);
2401       ajnew = xi;
2402       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
2403       ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2404       ierr = PetscFree(ajfill);CHKERRQ(ierr);
2405       ajfill = xi;
2406       realloc++; /* count how many reallocations are needed */
2407     }
2408     xi          = ajnew + ainew[prow];
2409     flev        = ajfill + ainew[prow];
2410     dloc[prow]  = nzi;
2411     fm          = fill[n];
2412     while (nzf--) {
2413       *xi++   = fm;
2414       *flev++ = im[fm];
2415       fm      = fill[fm];
2416     }
2417     /* make sure row has diagonal entry */
2418     if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2419       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,1,"Row %d has missing diagonal in factored matrix\n\
2420     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2421     }
2422   }
2423   ierr = PetscFree(ajfill);CHKERRQ(ierr);
2424   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2425   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2426   ierr = PetscFree(fill);CHKERRQ(ierr);
2427   ierr = PetscFree(im);CHKERRQ(ierr);
2428 
2429   {
2430     double af = ((double)ainew[n])/((double)ai[n]);
2431     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
2432              realloc,f,af);
2433     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
2434     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
2435     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
2436     if (diagonal_fill) {
2437       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
2438     }
2439   }
2440 
2441   /* put together the new matrix */
2442   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
2443   PLogObjectParent(*fact,isicol);
2444   b = (Mat_SeqBAIJ *) (*fact)->data;
2445   ierr = PetscFree(b->imax);CHKERRQ(ierr);
2446   b->singlemalloc = 0;
2447   len = bs2*ainew[n]*sizeof(MatScalar);
2448   /* the next line frees the default space generated by the Create() */
2449   ierr = PetscFree(b->a);CHKERRQ(ierr);
2450   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
2451   b->a          = (MatScalar *) PetscMalloc( len );CHKPTRQ(b->a);
2452   b->j          = ajnew;
2453   b->i          = ainew;
2454   for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
2455   b->diag       = dloc;
2456   b->ilen       = 0;
2457   b->imax       = 0;
2458   b->row        = isrow;
2459   b->col        = iscol;
2460   b->icol       = isicol;
2461   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
2462   /* In b structure:  Free imax, ilen, old a, old j.
2463      Allocate dloc, solve_work, new a, new j */
2464   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
2465   b->maxnz          = b->nz = ainew[n];
2466   (*fact)->factor   = FACTOR_LU;
2467 
2468   (*fact)->info.factor_mallocs    = realloc;
2469   (*fact)->info.fill_ratio_given  = f;
2470   (*fact)->info.fill_ratio_needed = ((double)ainew[n])/((double)ai[prow]);
2471 
2472   PetscFunctionReturn(0);
2473 }
2474 
2475 
2476 
2477 
2478