xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision c4ac5f5fc01a91c2bb048584e85ba147c7d8a938)
1 /*$Id: baijfact2.c,v 1.35 2000/01/11 21:00:52 bsmith Exp bsmith $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17   int             *diag = a->diag;
18   MatScalar       *aa=a->a,*v;
19   Scalar          s1,*x,*b;
20 
21   PetscFunctionBegin;
22   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24 
25   /* forward solve the U^T */
26   for (i=0; i<n; i++) {
27 
28     v     = aa + diag[i];
29     /* multiply by the inverse of the block diagonal */
30     s1    = (*v++)*b[i];
31     vi    = aj + diag[i] + 1;
32     nz    = ai[i+1] - diag[i] - 1;
33     while (nz--) {
34       x[*vi++]  -= (*v++)*s1;
35     }
36     x[i]   = s1;
37   }
38   /* backward solve the L^T */
39   for (i=n-1; i>=0; i--){
40     v    = aa + diag[i] - 1;
41     vi   = aj + diag[i] - 1;
42     nz   = diag[i] - ai[i];
43     s1   = x[i];
44     while (nz--) {
45       x[*vi--]   -=  (*v--)*s1;
46     }
47   }
48   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50   PLogFlops(2*(a->nz) - a->n);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNC__
55 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
56 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57 {
58   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60   int             *diag = a->diag,oidx;
61   MatScalar       *aa=a->a,*v;
62   Scalar          s1,s2,x1,x2;
63   Scalar          *x,*b;
64 
65   PetscFunctionBegin;
66   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
67   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
68 
69   /* forward solve the U^T */
70   idx = 0;
71   for (i=0; i<n; i++) {
72 
73     v     = aa + 4*diag[i];
74     /* multiply by the inverse of the block diagonal */
75     x1 = b[idx];   x2 = b[1+idx];
76     s1 = v[0]*x1  +  v[1]*x2;
77     s2 = v[2]*x1  +  v[3]*x2;
78     v += 4;
79 
80     vi    = aj + diag[i] + 1;
81     nz    = ai[i+1] - diag[i] - 1;
82     while (nz--) {
83       oidx = 2*(*vi++);
84       x[oidx]   -= v[0]*s1  +  v[1]*s2;
85       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
86       v  += 4;
87     }
88     x[idx]   = s1;x[1+idx] = s2;
89     idx += 2;
90   }
91   /* backward solve the L^T */
92   for (i=n-1; i>=0; i--){
93     v    = aa + 4*diag[i] - 4;
94     vi   = aj + diag[i] - 1;
95     nz   = diag[i] - ai[i];
96     idt  = 2*i;
97     s1   = x[idt];  s2 = x[1+idt];
98     while (nz--) {
99       idx   = 2*(*vi--);
100       x[idx]   -=  v[0]*s1 +  v[1]*s2;
101       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
102       v -= 4;
103     }
104   }
105   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
106   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
107   PLogFlops(2*4*(a->nz) - 2*a->n);
108   PetscFunctionReturn(0);
109 }
110 
111 #undef __FUNC__
112 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
113 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
114 {
115   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
116   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
117   int             *diag = a->diag,oidx;
118   MatScalar       *aa=a->a,*v;
119   Scalar          s1,s2,s3,x1,x2,x3;
120   Scalar          *x,*b;
121 
122   PetscFunctionBegin;
123   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
124   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
125 
126   /* forward solve the U^T */
127   idx = 0;
128   for (i=0; i<n; i++) {
129 
130     v     = aa + 9*diag[i];
131     /* multiply by the inverse of the block diagonal */
132     x1 = b[idx];   x2 = b[1+idx]; x3    = b[2+idx];
133     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
134     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
135     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
136     v += 9;
137 
138     vi    = aj + diag[i] + 1;
139     nz    = ai[i+1] - diag[i] - 1;
140     while (nz--) {
141       oidx = 3*(*vi++);
142       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
143       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
144       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
145       v  += 9;
146     }
147     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
148     idx += 3;
149   }
150   /* backward solve the L^T */
151   for (i=n-1; i>=0; i--){
152     v    = aa + 9*diag[i] - 9;
153     vi   = aj + diag[i] - 1;
154     nz   = diag[i] - ai[i];
155     idt  = 3*i;
156     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
157     while (nz--) {
158       idx   = 3*(*vi--);
159       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
160       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
161       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
162       v -= 9;
163     }
164   }
165   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
167   PLogFlops(2*9*(a->nz) - 3*a->n);
168   PetscFunctionReturn(0);
169 }
170 
171 #undef __FUNC__
172 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
173 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
174 {
175   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
176   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
177   int             *diag = a->diag,oidx;
178   MatScalar       *aa=a->a,*v;
179   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
180   Scalar          *x,*b;
181 
182   PetscFunctionBegin;
183   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
184   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
185 
186   /* forward solve the U^T */
187   idx = 0;
188   for (i=0; i<n; i++) {
189 
190     v     = aa + 16*diag[i];
191     /* multiply by the inverse of the block diagonal */
192     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx];
193     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
194     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
195     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
196     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
197     v += 16;
198 
199     vi    = aj + diag[i] + 1;
200     nz    = ai[i+1] - diag[i] - 1;
201     while (nz--) {
202       oidx = 4*(*vi++);
203       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
204       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
205       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
206       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
207       v  += 16;
208     }
209     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
210     idx += 4;
211   }
212   /* backward solve the L^T */
213   for (i=n-1; i>=0; i--){
214     v    = aa + 16*diag[i] - 16;
215     vi   = aj + diag[i] - 1;
216     nz   = diag[i] - ai[i];
217     idt  = 4*i;
218     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
219     while (nz--) {
220       idx   = 4*(*vi--);
221       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
222       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
223       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
224       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
225       v -= 16;
226     }
227   }
228   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
229   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
230   PLogFlops(2*16*(a->nz) - 4*a->n);
231   PetscFunctionReturn(0);
232 }
233 
234 #undef __FUNC__
235 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
236 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
237 {
238   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
239   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
240   int             *diag = a->diag,oidx;
241   MatScalar       *aa=a->a,*v;
242   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
243   Scalar          *x,*b;
244 
245   PetscFunctionBegin;
246   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
247   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
248 
249   /* forward solve the U^T */
250   idx = 0;
251   for (i=0; i<n; i++) {
252 
253     v     = aa + 25*diag[i];
254     /* multiply by the inverse of the block diagonal */
255     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
256     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
257     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
258     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
259     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
260     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
261     v += 25;
262 
263     vi    = aj + diag[i] + 1;
264     nz    = ai[i+1] - diag[i] - 1;
265     while (nz--) {
266       oidx = 5*(*vi++);
267       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
268       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
269       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
270       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
271       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
272       v  += 25;
273     }
274     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
275     idx += 5;
276   }
277   /* backward solve the L^T */
278   for (i=n-1; i>=0; i--){
279     v    = aa + 25*diag[i] - 25;
280     vi   = aj + diag[i] - 1;
281     nz   = diag[i] - ai[i];
282     idt  = 5*i;
283     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
284     while (nz--) {
285       idx   = 5*(*vi--);
286       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
287       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
288       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
289       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
290       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
291       v -= 25;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   PLogFlops(2*25*(a->nz) - 5*a->n);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNC__
301 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
302 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
305   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
306   int             *diag = a->diag,oidx;
307   MatScalar       *aa=a->a,*v;
308   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
309   Scalar          *x,*b;
310 
311   PetscFunctionBegin;
312   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
313   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
314 
315   /* forward solve the U^T */
316   idx = 0;
317   for (i=0; i<n; i++) {
318 
319     v     = aa + 36*diag[i];
320     /* multiply by the inverse of the block diagonal */
321     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
322     x6    = b[5+idx];
323     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
324     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
325     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
326     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
327     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
328     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
329     v += 36;
330 
331     vi    = aj + diag[i] + 1;
332     nz    = ai[i+1] - diag[i] - 1;
333     while (nz--) {
334       oidx = 6*(*vi++);
335       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
336       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
337       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
338       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
339       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
340       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
341       v  += 36;
342     }
343     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
344     x[5+idx] = s6;
345     idx += 6;
346   }
347   /* backward solve the L^T */
348   for (i=n-1; i>=0; i--){
349     v    = aa + 36*diag[i] - 36;
350     vi   = aj + diag[i] - 1;
351     nz   = diag[i] - ai[i];
352     idt  = 6*i;
353     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
354     s6 = x[5+idt];
355     while (nz--) {
356       idx   = 6*(*vi--);
357       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
358       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
359       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
360       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
361       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
362       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
363       v -= 36;
364     }
365   }
366   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
367   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
368   PLogFlops(2*36*(a->nz) - 6*a->n);
369   PetscFunctionReturn(0);
370 }
371 
372 #undef __FUNC__
373 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
374 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
375 {
376   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
377   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
378   int             *diag = a->diag,oidx;
379   MatScalar       *aa=a->a,*v;
380   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
381   Scalar          *x,*b;
382 
383   PetscFunctionBegin;
384   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
385   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
386 
387   /* forward solve the U^T */
388   idx = 0;
389   for (i=0; i<n; i++) {
390 
391     v     = aa + 49*diag[i];
392     /* multiply by the inverse of the block diagonal */
393     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
394     x6    = b[5+idx]; x7 = b[6+idx];
395     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
396     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
397     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
398     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
399     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
400     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
401     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
402     v += 49;
403 
404     vi    = aj + diag[i] + 1;
405     nz    = ai[i+1] - diag[i] - 1;
406     while (nz--) {
407       oidx = 7*(*vi++);
408       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
409       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
410       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
411       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
412       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
413       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
414       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
415       v  += 49;
416     }
417     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
418     x[5+idx] = s6;x[6+idx] = s7;
419     idx += 7;
420   }
421   /* backward solve the L^T */
422   for (i=n-1; i>=0; i--){
423     v    = aa + 49*diag[i] - 49;
424     vi   = aj + diag[i] - 1;
425     nz   = diag[i] - ai[i];
426     idt  = 7*i;
427     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
428     s6 = x[5+idt];s7 = x[6+idt];
429     while (nz--) {
430       idx   = 7*(*vi--);
431       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
432       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
433       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
434       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
435       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
436       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
437       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
438       v -= 49;
439     }
440   }
441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
443   PLogFlops(2*49*(a->nz) - 7*a->n);
444   PetscFunctionReturn(0);
445 }
446 
447 /*---------------------------------------------------------------------------------------------*/
448 #undef __FUNC__
449 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_1"
450 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
451 {
452   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
453   IS              iscol=a->col,isrow=a->row;
454   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
455   int             *diag = a->diag;
456   MatScalar       *aa=a->a,*v;
457   Scalar          s1,*x,*b,*t;
458 
459   PetscFunctionBegin;
460   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
461   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
462   t  = a->solve_work;
463 
464   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
465   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
466 
467   /* copy the b into temp work space according to permutation */
468   for (i=0; i<n; i++) {
469     t[i] = b[c[i]];
470   }
471 
472   /* forward solve the U^T */
473   for (i=0; i<n; i++) {
474 
475     v     = aa + diag[i];
476     /* multiply by the inverse of the block diagonal */
477     s1    = (*v++)*t[i];
478     vi    = aj + diag[i] + 1;
479     nz    = ai[i+1] - diag[i] - 1;
480     while (nz--) {
481       t[*vi++]  -= (*v++)*s1;
482     }
483     t[i]   = s1;
484   }
485   /* backward solve the L^T */
486   for (i=n-1; i>=0; i--){
487     v    = aa + diag[i] - 1;
488     vi   = aj + diag[i] - 1;
489     nz   = diag[i] - ai[i];
490     s1   = t[i];
491     while (nz--) {
492       t[*vi--]   -=  (*v--)*s1;
493     }
494   }
495 
496   /* copy t into x according to permutation */
497   for (i=0; i<n; i++) {
498     x[r[i]]   = t[i];
499   }
500 
501   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
502   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
503   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
504   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
505   PLogFlops(2*(a->nz) - a->n);
506   PetscFunctionReturn(0);
507 }
508 
509 #undef __FUNC__
510 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_2"
511 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
512 {
513   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
514   IS              iscol=a->col,isrow=a->row;
515   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
516   int             *diag = a->diag,ii,ic,ir,oidx;
517   MatScalar       *aa=a->a,*v;
518   Scalar          s1,s2,x1,x2;
519   Scalar          *x,*b,*t;
520 
521   PetscFunctionBegin;
522   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
523   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524   t  = a->solve_work;
525 
526   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
527   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
528 
529   /* copy the b into temp work space according to permutation */
530   ii = 0;
531   for (i=0; i<n; i++) {
532     ic      = 2*c[i];
533     t[ii]   = b[ic];
534     t[ii+1] = b[ic+1];
535     ii += 2;
536   }
537 
538   /* forward solve the U^T */
539   idx = 0;
540   for (i=0; i<n; i++) {
541 
542     v     = aa + 4*diag[i];
543     /* multiply by the inverse of the block diagonal */
544     x1    = t[idx];   x2 = t[1+idx];
545     s1 = v[0]*x1  +  v[1]*x2;
546     s2 = v[2]*x1  +  v[3]*x2;
547     v += 4;
548 
549     vi    = aj + diag[i] + 1;
550     nz    = ai[i+1] - diag[i] - 1;
551     while (nz--) {
552       oidx = 2*(*vi++);
553       t[oidx]   -= v[0]*s1  +  v[1]*s2;
554       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
555       v  += 4;
556     }
557     t[idx]   = s1;t[1+idx] = s2;
558     idx += 2;
559   }
560   /* backward solve the L^T */
561   for (i=n-1; i>=0; i--){
562     v    = aa + 4*diag[i] - 4;
563     vi   = aj + diag[i] - 1;
564     nz   = diag[i] - ai[i];
565     idt  = 2*i;
566     s1 = t[idt];  s2 = t[1+idt];
567     while (nz--) {
568       idx   = 2*(*vi--);
569       t[idx]   -=  v[0]*s1 +  v[1]*s2;
570       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
571       v -= 4;
572     }
573   }
574 
575   /* copy t into x according to permutation */
576   ii = 0;
577   for (i=0; i<n; i++) {
578     ir      = 2*r[i];
579     x[ir]   = t[ii];
580     x[ir+1] = t[ii+1];
581     ii += 2;
582   }
583 
584   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
585   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
586   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
587   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
588   PLogFlops(2*4*(a->nz) - 2*a->n);
589   PetscFunctionReturn(0);
590 }
591 
592 #undef __FUNC__
593 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_3"
594 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
595 {
596   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
597   IS              iscol=a->col,isrow=a->row;
598   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
599   int             *diag = a->diag,ii,ic,ir,oidx;
600   MatScalar       *aa=a->a,*v;
601   Scalar          s1,s2,s3,x1,x2,x3;
602   Scalar          *x,*b,*t;
603 
604   PetscFunctionBegin;
605   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
606   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
607   t  = a->solve_work;
608 
609   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
610   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
611 
612   /* copy the b into temp work space according to permutation */
613   ii = 0;
614   for (i=0; i<n; i++) {
615     ic      = 3*c[i];
616     t[ii]   = b[ic];
617     t[ii+1] = b[ic+1];
618     t[ii+2] = b[ic+2];
619     ii += 3;
620   }
621 
622   /* forward solve the U^T */
623   idx = 0;
624   for (i=0; i<n; i++) {
625 
626     v     = aa + 9*diag[i];
627     /* multiply by the inverse of the block diagonal */
628     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
629     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
630     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
631     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
632     v += 9;
633 
634     vi    = aj + diag[i] + 1;
635     nz    = ai[i+1] - diag[i] - 1;
636     while (nz--) {
637       oidx = 3*(*vi++);
638       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
639       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
640       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
641       v  += 9;
642     }
643     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
644     idx += 3;
645   }
646   /* backward solve the L^T */
647   for (i=n-1; i>=0; i--){
648     v    = aa + 9*diag[i] - 9;
649     vi   = aj + diag[i] - 1;
650     nz   = diag[i] - ai[i];
651     idt  = 3*i;
652     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
653     while (nz--) {
654       idx   = 3*(*vi--);
655       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
656       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
657       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658       v -= 9;
659     }
660   }
661 
662   /* copy t into x according to permutation */
663   ii = 0;
664   for (i=0; i<n; i++) {
665     ir      = 3*r[i];
666     x[ir]   = t[ii];
667     x[ir+1] = t[ii+1];
668     x[ir+2] = t[ii+2];
669     ii += 3;
670   }
671 
672   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
673   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
674   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
675   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
676   PLogFlops(2*9*(a->nz) - 3*a->n);
677   PetscFunctionReturn(0);
678 }
679 
680 #undef __FUNC__
681 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_4"
682 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
683 {
684   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
685   IS              iscol=a->col,isrow=a->row;
686   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
687   int             *diag = a->diag,ii,ic,ir,oidx;
688   MatScalar       *aa=a->a,*v;
689   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
690   Scalar          *x,*b,*t;
691 
692   PetscFunctionBegin;
693   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
694   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
695   t  = a->solve_work;
696 
697   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
698   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
699 
700   /* copy the b into temp work space according to permutation */
701   ii = 0;
702   for (i=0; i<n; i++) {
703     ic      = 4*c[i];
704     t[ii]   = b[ic];
705     t[ii+1] = b[ic+1];
706     t[ii+2] = b[ic+2];
707     t[ii+3] = b[ic+3];
708     ii += 4;
709   }
710 
711   /* forward solve the U^T */
712   idx = 0;
713   for (i=0; i<n; i++) {
714 
715     v     = aa + 16*diag[i];
716     /* multiply by the inverse of the block diagonal */
717     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
718     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
719     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
720     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
721     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
722     v += 16;
723 
724     vi    = aj + diag[i] + 1;
725     nz    = ai[i+1] - diag[i] - 1;
726     while (nz--) {
727       oidx = 4*(*vi++);
728       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
729       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
730       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
731       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
732       v  += 16;
733     }
734     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
735     idx += 4;
736   }
737   /* backward solve the L^T */
738   for (i=n-1; i>=0; i--){
739     v    = aa + 16*diag[i] - 16;
740     vi   = aj + diag[i] - 1;
741     nz   = diag[i] - ai[i];
742     idt  = 4*i;
743     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
744     while (nz--) {
745       idx   = 4*(*vi--);
746       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750       v -= 16;
751     }
752   }
753 
754   /* copy t into x according to permutation */
755   ii = 0;
756   for (i=0; i<n; i++) {
757     ir      = 4*r[i];
758     x[ir]   = t[ii];
759     x[ir+1] = t[ii+1];
760     x[ir+2] = t[ii+2];
761     x[ir+3] = t[ii+3];
762     ii += 4;
763   }
764 
765   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
766   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
767   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   PLogFlops(2*16*(a->nz) - 4*a->n);
770   PetscFunctionReturn(0);
771 }
772 
773 #undef __FUNC__
774 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_5"
775 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
778   IS              iscol=a->col,isrow=a->row;
779   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
780   int             *diag = a->diag,ii,ic,ir,oidx;
781   MatScalar       *aa=a->a,*v;
782   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
783   Scalar          *x,*b,*t;
784 
785   PetscFunctionBegin;
786   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
788   t  = a->solve_work;
789 
790   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
791   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
792 
793   /* copy the b into temp work space according to permutation */
794   ii = 0;
795   for (i=0; i<n; i++) {
796     ic      = 5*c[i];
797     t[ii]   = b[ic];
798     t[ii+1] = b[ic+1];
799     t[ii+2] = b[ic+2];
800     t[ii+3] = b[ic+3];
801     t[ii+4] = b[ic+4];
802     ii += 5;
803   }
804 
805   /* forward solve the U^T */
806   idx = 0;
807   for (i=0; i<n; i++) {
808 
809     v     = aa + 25*diag[i];
810     /* multiply by the inverse of the block diagonal */
811     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
812     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
813     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
814     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
815     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
816     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
817     v += 25;
818 
819     vi    = aj + diag[i] + 1;
820     nz    = ai[i+1] - diag[i] - 1;
821     while (nz--) {
822       oidx = 5*(*vi++);
823       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
824       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
825       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
826       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
827       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
828       v  += 25;
829     }
830     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
831     idx += 5;
832   }
833   /* backward solve the L^T */
834   for (i=n-1; i>=0; i--){
835     v    = aa + 25*diag[i] - 25;
836     vi   = aj + diag[i] - 1;
837     nz   = diag[i] - ai[i];
838     idt  = 5*i;
839     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
840     while (nz--) {
841       idx   = 5*(*vi--);
842       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847       v -= 25;
848     }
849   }
850 
851   /* copy t into x according to permutation */
852   ii = 0;
853   for (i=0; i<n; i++) {
854     ir      = 5*r[i];
855     x[ir]   = t[ii];
856     x[ir+1] = t[ii+1];
857     x[ir+2] = t[ii+2];
858     x[ir+3] = t[ii+3];
859     x[ir+4] = t[ii+4];
860     ii += 5;
861   }
862 
863   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
864   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
865   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
866   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
867   PLogFlops(2*25*(a->nz) - 5*a->n);
868   PetscFunctionReturn(0);
869 }
870 
871 #undef __FUNC__
872 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_6"
873 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
874 {
875   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
876   IS              iscol=a->col,isrow=a->row;
877   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
878   int             *diag = a->diag,ii,ic,ir,oidx;
879   MatScalar       *aa=a->a,*v;
880   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
881   Scalar          *x,*b,*t;
882 
883   PetscFunctionBegin;
884   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
885   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
886   t  = a->solve_work;
887 
888   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
889   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
890 
891   /* copy the b into temp work space according to permutation */
892   ii = 0;
893   for (i=0; i<n; i++) {
894     ic      = 6*c[i];
895     t[ii]   = b[ic];
896     t[ii+1] = b[ic+1];
897     t[ii+2] = b[ic+2];
898     t[ii+3] = b[ic+3];
899     t[ii+4] = b[ic+4];
900     t[ii+5] = b[ic+5];
901     ii += 6;
902   }
903 
904   /* forward solve the U^T */
905   idx = 0;
906   for (i=0; i<n; i++) {
907 
908     v     = aa + 36*diag[i];
909     /* multiply by the inverse of the block diagonal */
910     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
911     x6    = t[5+idx];
912     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
913     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
914     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
915     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
916     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
917     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
918     v += 36;
919 
920     vi    = aj + diag[i] + 1;
921     nz    = ai[i+1] - diag[i] - 1;
922     while (nz--) {
923       oidx = 6*(*vi++);
924       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
925       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
926       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
927       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
928       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
929       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
930       v  += 36;
931     }
932     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
933     t[5+idx] = s6;
934     idx += 6;
935   }
936   /* backward solve the L^T */
937   for (i=n-1; i>=0; i--){
938     v    = aa + 36*diag[i] - 36;
939     vi   = aj + diag[i] - 1;
940     nz   = diag[i] - ai[i];
941     idt  = 6*i;
942     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
943     s6 = t[5+idt];
944     while (nz--) {
945       idx   = 6*(*vi--);
946       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
947       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
948       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
949       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
950       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
951       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
952       v -= 36;
953     }
954   }
955 
956   /* copy t into x according to permutation */
957   ii = 0;
958   for (i=0; i<n; i++) {
959     ir      = 6*r[i];
960     x[ir]   = t[ii];
961     x[ir+1] = t[ii+1];
962     x[ir+2] = t[ii+2];
963     x[ir+3] = t[ii+3];
964     x[ir+4] = t[ii+4];
965     x[ir+5] = t[ii+5];
966     ii += 6;
967   }
968 
969   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
970   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
971   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
972   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
973   PLogFlops(2*36*(a->nz) - 6*a->n);
974   PetscFunctionReturn(0);
975 }
976 
977 #undef __FUNC__
978 #define __FUNC__ "MatSolveTranspose_SeqBAIJ_7"
979 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
980 {
981   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
982   IS              iscol=a->col,isrow=a->row;
983   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
984   int             *diag = a->diag,ii,ic,ir,oidx;
985   MatScalar       *aa=a->a,*v;
986   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
987   Scalar          *x,*b,*t;
988 
989   PetscFunctionBegin;
990   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
991   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
992   t  = a->solve_work;
993 
994   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
995   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
996 
997   /* copy the b into temp work space according to permutation */
998   ii = 0;
999   for (i=0; i<n; i++) {
1000     ic      = 7*c[i];
1001     t[ii]   = b[ic];
1002     t[ii+1] = b[ic+1];
1003     t[ii+2] = b[ic+2];
1004     t[ii+3] = b[ic+3];
1005     t[ii+4] = b[ic+4];
1006     t[ii+5] = b[ic+5];
1007     t[ii+6] = b[ic+6];
1008     ii += 7;
1009   }
1010 
1011   /* forward solve the U^T */
1012   idx = 0;
1013   for (i=0; i<n; i++) {
1014 
1015     v     = aa + 49*diag[i];
1016     /* multiply by the inverse of the block diagonal */
1017     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1018     x6    = t[5+idx]; x7 = t[6+idx];
1019     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1020     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1021     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1022     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1023     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1024     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1025     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1026     v += 49;
1027 
1028     vi    = aj + diag[i] + 1;
1029     nz    = ai[i+1] - diag[i] - 1;
1030     while (nz--) {
1031       oidx = 7*(*vi++);
1032       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1033       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1034       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1035       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1036       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1037       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1038       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1039       v  += 49;
1040     }
1041     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1042     t[5+idx] = s6;t[6+idx] = s7;
1043     idx += 7;
1044   }
1045   /* backward solve the L^T */
1046   for (i=n-1; i>=0; i--){
1047     v    = aa + 49*diag[i] - 49;
1048     vi   = aj + diag[i] - 1;
1049     nz   = diag[i] - ai[i];
1050     idt  = 7*i;
1051     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1052     s6 = t[5+idt];s7 = t[6+idt];
1053     while (nz--) {
1054       idx   = 7*(*vi--);
1055       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1056       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1057       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1058       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1059       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1060       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1061       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1062       v -= 49;
1063     }
1064   }
1065 
1066   /* copy t into x according to permutation */
1067   ii = 0;
1068   for (i=0; i<n; i++) {
1069     ir      = 7*r[i];
1070     x[ir]   = t[ii];
1071     x[ir+1] = t[ii+1];
1072     x[ir+2] = t[ii+2];
1073     x[ir+3] = t[ii+3];
1074     x[ir+4] = t[ii+4];
1075     x[ir+5] = t[ii+5];
1076     x[ir+6] = t[ii+6];
1077     ii += 7;
1078   }
1079 
1080   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1081   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1082   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1083   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1084   PLogFlops(2*49*(a->nz) - 7*a->n);
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 /* ----------------------------------------------------------- */
1089 #undef __FUNC__
1090 #define __FUNC__ "MatSolve_SeqBAIJ_N"
1091 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1092 {
1093   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1094   IS              iscol=a->col,isrow=a->row;
1095   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1096   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1097   MatScalar       *aa=a->a,*v;
1098   Scalar          *x,*b,*s,*t,*ls;
1099 
1100   PetscFunctionBegin;
1101   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1102   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1103   t  = a->solve_work;
1104 
1105   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1106   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1107 
1108   /* forward solve the lower triangular */
1109   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1110   for (i=1; i<n; i++) {
1111     v   = aa + bs2*ai[i];
1112     vi  = aj + ai[i];
1113     nz  = a->diag[i] - ai[i];
1114     s = t + bs*i;
1115     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1116     while (nz--) {
1117       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1118       v += bs2;
1119     }
1120   }
1121   /* backward solve the upper triangular */
1122   ls = a->solve_work + a->n;
1123   for (i=n-1; i>=0; i--){
1124     v   = aa + bs2*(a->diag[i] + 1);
1125     vi  = aj + a->diag[i] + 1;
1126     nz  = ai[i+1] - a->diag[i] - 1;
1127     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1128     while (nz--) {
1129       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1130       v += bs2;
1131     }
1132     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1133     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1134   }
1135 
1136   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1137   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1138   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1139   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1140   PLogFlops(2*(a->bs2)*(a->nz) - a->bs*a->n);
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 #undef __FUNC__
1145 #define __FUNC__ "MatSolve_SeqBAIJ_7"
1146 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1147 {
1148   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1149   IS              iscol=a->col,isrow=a->row;
1150   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1151   int             *diag = a->diag;
1152   MatScalar       *aa=a->a,*v;
1153   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1154   Scalar          *x,*b,*t;
1155 
1156   PetscFunctionBegin;
1157   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1158   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1159   t  = a->solve_work;
1160 
1161   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1162   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1163 
1164   /* forward solve the lower triangular */
1165   idx    = 7*(*r++);
1166   t[0] = b[idx];   t[1] = b[1+idx];
1167   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1168   t[5] = b[5+idx]; t[6] = b[6+idx];
1169 
1170   for (i=1; i<n; i++) {
1171     v     = aa + 49*ai[i];
1172     vi    = aj + ai[i];
1173     nz    = diag[i] - ai[i];
1174     idx   = 7*(*r++);
1175     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1176     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1177     while (nz--) {
1178       idx   = 7*(*vi++);
1179       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1180       x4    = t[3+idx];x5 = t[4+idx];
1181       x6    = t[5+idx];x7 = t[6+idx];
1182       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1183       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1184       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1185       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1186       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1187       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1188       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1189       v += 49;
1190     }
1191     idx = 7*i;
1192     t[idx]   = s1;t[1+idx] = s2;
1193     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1194     t[5+idx] = s6;t[6+idx] = s7;
1195   }
1196   /* backward solve the upper triangular */
1197   for (i=n-1; i>=0; i--){
1198     v    = aa + 49*diag[i] + 49;
1199     vi   = aj + diag[i] + 1;
1200     nz   = ai[i+1] - diag[i] - 1;
1201     idt  = 7*i;
1202     s1 = t[idt];  s2 = t[1+idt];
1203     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1204     s6 = t[5+idt];s7 = t[6+idt];
1205     while (nz--) {
1206       idx   = 7*(*vi++);
1207       x1    = t[idx];   x2 = t[1+idx];
1208       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1209       x6    = t[5+idx]; x7 = t[6+idx];
1210       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1211       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1212       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1213       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1214       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1215       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1216       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1217       v += 49;
1218     }
1219     idc = 7*(*c--);
1220     v   = aa + 49*diag[i];
1221     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1222                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1223     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1224                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1225     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1226                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1227     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1228                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1229     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1230                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1231     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1232                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1233     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1234                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1235   }
1236 
1237   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1238   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1239   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1241   PLogFlops(2*49*(a->nz) - 7*a->n);
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 #undef __FUNC__
1246 #define __FUNC__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1247 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1248 {
1249   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1250   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1251   int             ierr,*diag = a->diag,jdx;
1252   MatScalar       *aa=a->a,*v;
1253   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1254 
1255   PetscFunctionBegin;
1256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1258   /* forward solve the lower triangular */
1259   idx    = 0;
1260   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1261   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1262   x[6] = b[6+idx];
1263   for (i=1; i<n; i++) {
1264     v     =  aa + 49*ai[i];
1265     vi    =  aj + ai[i];
1266     nz    =  diag[i] - ai[i];
1267     idx   =  7*i;
1268     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1269     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1270     s7  =  b[6+idx];
1271     while (nz--) {
1272       jdx   = 7*(*vi++);
1273       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1274       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1275       x7    = x[6+jdx];
1276       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1277       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1278       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1279       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1280       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1281       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1282       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1283       v += 49;
1284      }
1285     x[idx]   = s1;
1286     x[1+idx] = s2;
1287     x[2+idx] = s3;
1288     x[3+idx] = s4;
1289     x[4+idx] = s5;
1290     x[5+idx] = s6;
1291     x[6+idx] = s7;
1292   }
1293   /* backward solve the upper triangular */
1294   for (i=n-1; i>=0; i--){
1295     v    = aa + 49*diag[i] + 49;
1296     vi   = aj + diag[i] + 1;
1297     nz   = ai[i+1] - diag[i] - 1;
1298     idt  = 7*i;
1299     s1 = x[idt];   s2 = x[1+idt];
1300     s3 = x[2+idt]; s4 = x[3+idt];
1301     s5 = x[4+idt]; s6 = x[5+idt];
1302     s7 = x[6+idt];
1303     while (nz--) {
1304       idx   = 7*(*vi++);
1305       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1306       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1307       x7    = x[6+idx];
1308       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1309       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1310       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1311       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1312       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1313       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1314       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1315       v += 49;
1316     }
1317     v        = aa + 49*diag[i];
1318     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1319                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1320     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1321                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1322     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1323                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1324     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1325                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1326     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1327                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1328     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1329                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1330     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1331                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1332   }
1333 
1334   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1336   PLogFlops(2*36*(a->nz) - 6*a->n);
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 #undef __FUNC__
1341 #define __FUNC__ "MatSolve_SeqBAIJ_6"
1342 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1343 {
1344   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1345   IS              iscol=a->col,isrow=a->row;
1346   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1347   int             *diag = a->diag;
1348   MatScalar       *aa=a->a,*v;
1349   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1350 
1351   PetscFunctionBegin;
1352   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354   t  = a->solve_work;
1355 
1356   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1358 
1359   /* forward solve the lower triangular */
1360   idx    = 6*(*r++);
1361   t[0] = b[idx];   t[1] = b[1+idx];
1362   t[2] = b[2+idx]; t[3] = b[3+idx];
1363   t[4] = b[4+idx]; t[5] = b[5+idx];
1364   for (i=1; i<n; i++) {
1365     v     = aa + 36*ai[i];
1366     vi    = aj + ai[i];
1367     nz    = diag[i] - ai[i];
1368     idx   = 6*(*r++);
1369     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1370     s5  = b[4+idx]; s6 = b[5+idx];
1371     while (nz--) {
1372       idx   = 6*(*vi++);
1373       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1374       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1375       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1376       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1377       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1378       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1379       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1380       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1381       v += 36;
1382     }
1383     idx = 6*i;
1384     t[idx]   = s1;t[1+idx] = s2;
1385     t[2+idx] = s3;t[3+idx] = s4;
1386     t[4+idx] = s5;t[5+idx] = s6;
1387   }
1388   /* backward solve the upper triangular */
1389   for (i=n-1; i>=0; i--){
1390     v    = aa + 36*diag[i] + 36;
1391     vi   = aj + diag[i] + 1;
1392     nz   = ai[i+1] - diag[i] - 1;
1393     idt  = 6*i;
1394     s1 = t[idt];  s2 = t[1+idt];
1395     s3 = t[2+idt];s4 = t[3+idt];
1396     s5 = t[4+idt];s6 = t[5+idt];
1397     while (nz--) {
1398       idx   = 6*(*vi++);
1399       x1    = t[idx];   x2 = t[1+idx];
1400       x3    = t[2+idx]; x4 = t[3+idx];
1401       x5    = t[4+idx]; x6 = t[5+idx];
1402       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1403       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1404       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1405       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1406       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1407       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1408       v += 36;
1409     }
1410     idc = 6*(*c--);
1411     v   = aa + 36*diag[i];
1412     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1413                                  v[18]*s4+v[24]*s5+v[30]*s6;
1414     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1415                                  v[19]*s4+v[25]*s5+v[31]*s6;
1416     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1417                                  v[20]*s4+v[26]*s5+v[32]*s6;
1418     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1419                                  v[21]*s4+v[27]*s5+v[33]*s6;
1420     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1421                                  v[22]*s4+v[28]*s5+v[34]*s6;
1422     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1423                                  v[23]*s4+v[29]*s5+v[35]*s6;
1424   }
1425 
1426   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1427   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1428   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1429   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1430   PLogFlops(2*36*(a->nz) - 6*a->n);
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 #undef __FUNC__
1435 #define __FUNC__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1436 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1437 {
1438   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1439   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1440   int             ierr,*diag = a->diag,jdx;
1441   MatScalar       *aa=a->a,*v;
1442   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1443 
1444   PetscFunctionBegin;
1445   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1446   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1447   /* forward solve the lower triangular */
1448   idx    = 0;
1449   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1450   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1451   for (i=1; i<n; i++) {
1452     v     =  aa + 36*ai[i];
1453     vi    =  aj + ai[i];
1454     nz    =  diag[i] - ai[i];
1455     idx   =  6*i;
1456     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1457     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1458     while (nz--) {
1459       jdx   = 6*(*vi++);
1460       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1461       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1462       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1463       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1464       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1465       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1466       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1467       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1468       v += 36;
1469      }
1470     x[idx]   = s1;
1471     x[1+idx] = s2;
1472     x[2+idx] = s3;
1473     x[3+idx] = s4;
1474     x[4+idx] = s5;
1475     x[5+idx] = s6;
1476   }
1477   /* backward solve the upper triangular */
1478   for (i=n-1; i>=0; i--){
1479     v    = aa + 36*diag[i] + 36;
1480     vi   = aj + diag[i] + 1;
1481     nz   = ai[i+1] - diag[i] - 1;
1482     idt  = 6*i;
1483     s1 = x[idt];   s2 = x[1+idt];
1484     s3 = x[2+idt]; s4 = x[3+idt];
1485     s5 = x[4+idt]; s6 = x[5+idt];
1486     while (nz--) {
1487       idx   = 6*(*vi++);
1488       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1489       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1490       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496       v += 36;
1497     }
1498     v        = aa + 36*diag[i];
1499     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1500     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1501     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1502     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1503     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1504     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1505   }
1506 
1507   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1508   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1509   PLogFlops(2*36*(a->nz) - 6*a->n);
1510   PetscFunctionReturn(0);
1511 }
1512 
1513 #undef __FUNC__
1514 #define __FUNC__ "MatSolve_SeqBAIJ_5"
1515 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1516 {
1517   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1518   IS              iscol=a->col,isrow=a->row;
1519   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1520   int             *diag = a->diag;
1521   MatScalar       *aa=a->a,*v;
1522   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1523 
1524   PetscFunctionBegin;
1525   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1526   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1527   t  = a->solve_work;
1528 
1529   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1530   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1531 
1532   /* forward solve the lower triangular */
1533   idx    = 5*(*r++);
1534   t[0] = b[idx];   t[1] = b[1+idx];
1535   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1536   for (i=1; i<n; i++) {
1537     v     = aa + 25*ai[i];
1538     vi    = aj + ai[i];
1539     nz    = diag[i] - ai[i];
1540     idx   = 5*(*r++);
1541     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1542     s5  = b[4+idx];
1543     while (nz--) {
1544       idx   = 5*(*vi++);
1545       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1546       x4    = t[3+idx];x5 = t[4+idx];
1547       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1548       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1549       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1550       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1551       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1552       v += 25;
1553     }
1554     idx = 5*i;
1555     t[idx]   = s1;t[1+idx] = s2;
1556     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1557   }
1558   /* backward solve the upper triangular */
1559   for (i=n-1; i>=0; i--){
1560     v    = aa + 25*diag[i] + 25;
1561     vi   = aj + diag[i] + 1;
1562     nz   = ai[i+1] - diag[i] - 1;
1563     idt  = 5*i;
1564     s1 = t[idt];  s2 = t[1+idt];
1565     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1566     while (nz--) {
1567       idx   = 5*(*vi++);
1568       x1    = t[idx];   x2 = t[1+idx];
1569       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1570       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1571       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1572       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1573       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1574       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1575       v += 25;
1576     }
1577     idc = 5*(*c--);
1578     v   = aa + 25*diag[i];
1579     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1580                                  v[15]*s4+v[20]*s5;
1581     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1582                                  v[16]*s4+v[21]*s5;
1583     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1584                                  v[17]*s4+v[22]*s5;
1585     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1586                                  v[18]*s4+v[23]*s5;
1587     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1588                                  v[19]*s4+v[24]*s5;
1589   }
1590 
1591   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1592   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1595   PLogFlops(2*25*(a->nz) - 5*a->n);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 #undef __FUNC__
1600 #define __FUNC__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1601 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1602 {
1603   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1604   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1605   int             ierr,*diag = a->diag,jdx;
1606   MatScalar       *aa=a->a,*v;
1607   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1608 
1609   PetscFunctionBegin;
1610   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1611   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1612   /* forward solve the lower triangular */
1613   idx    = 0;
1614   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1615   for (i=1; i<n; i++) {
1616     v     =  aa + 25*ai[i];
1617     vi    =  aj + ai[i];
1618     nz    =  diag[i] - ai[i];
1619     idx   =  5*i;
1620     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1621     while (nz--) {
1622       jdx   = 5*(*vi++);
1623       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1624       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1625       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1626       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1627       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1628       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1629       v    += 25;
1630     }
1631     x[idx]   = s1;
1632     x[1+idx] = s2;
1633     x[2+idx] = s3;
1634     x[3+idx] = s4;
1635     x[4+idx] = s5;
1636   }
1637   /* backward solve the upper triangular */
1638   for (i=n-1; i>=0; i--){
1639     v    = aa + 25*diag[i] + 25;
1640     vi   = aj + diag[i] + 1;
1641     nz   = ai[i+1] - diag[i] - 1;
1642     idt  = 5*i;
1643     s1 = x[idt];  s2 = x[1+idt];
1644     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1645     while (nz--) {
1646       idx   = 5*(*vi++);
1647       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1648       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1649       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1650       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1651       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1652       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1653       v    += 25;
1654     }
1655     v        = aa + 25*diag[i];
1656     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1657     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1658     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1659     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1660     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1661   }
1662 
1663   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1664   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1665   PLogFlops(2*25*(a->nz) - 5*a->n);
1666   PetscFunctionReturn(0);
1667 }
1668 
1669 #undef __FUNC__
1670 #define __FUNC__ "MatSolve_SeqBAIJ_4"
1671 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1672 {
1673   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1674   IS              iscol=a->col,isrow=a->row;
1675   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1676   int             *diag = a->diag;
1677   MatScalar       *aa=a->a,*v;
1678   Scalar          *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1679 
1680   PetscFunctionBegin;
1681   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1682   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1683   t  = a->solve_work;
1684 
1685   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1686   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1687 
1688   /* forward solve the lower triangular */
1689   idx    = 4*(*r++);
1690   t[0] = b[idx];   t[1] = b[1+idx];
1691   t[2] = b[2+idx]; t[3] = b[3+idx];
1692   for (i=1; i<n; i++) {
1693     v     = aa + 16*ai[i];
1694     vi    = aj + ai[i];
1695     nz    = diag[i] - ai[i];
1696     idx   = 4*(*r++);
1697     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1698     while (nz--) {
1699       idx   = 4*(*vi++);
1700       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1701       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1702       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1703       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1704       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1705       v    += 16;
1706     }
1707     idx        = 4*i;
1708     t[idx]   = s1;t[1+idx] = s2;
1709     t[2+idx] = s3;t[3+idx] = s4;
1710   }
1711   /* backward solve the upper triangular */
1712   for (i=n-1; i>=0; i--){
1713     v    = aa + 16*diag[i] + 16;
1714     vi   = aj + diag[i] + 1;
1715     nz   = ai[i+1] - diag[i] - 1;
1716     idt  = 4*i;
1717     s1 = t[idt];  s2 = t[1+idt];
1718     s3 = t[2+idt];s4 = t[3+idt];
1719     while (nz--) {
1720       idx   = 4*(*vi++);
1721       x1    = t[idx];   x2 = t[1+idx];
1722       x3    = t[2+idx]; x4 = t[3+idx];
1723       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1724       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1725       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1726       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1727       v += 16;
1728     }
1729     idc      = 4*(*c--);
1730     v        = aa + 16*diag[i];
1731     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1732     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1733     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1734     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1735   }
1736 
1737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741   PLogFlops(2*16*(a->nz) - 4*a->n);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 
1746 /*
1747       Special case where the matrix was ILU(0) factored in the natural
1748    ordering. This eliminates the need for the column and row permutation.
1749 */
1750 #undef __FUNC__
1751 #define __FUNC__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
1752 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1753 {
1754   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1755   int             n=a->mbs,*ai=a->i,*aj=a->j;
1756   int             ierr,*diag = a->diag;
1757   MatScalar       *aa=a->a;
1758   Scalar          *x,*b;
1759 
1760   PetscFunctionBegin;
1761   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763 
1764 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1765   {
1766     static Scalar w[2000]; /* very BAD need to fix */
1767     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1768   }
1769 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1770   {
1771     static Scalar w[2000]; /* very BAD need to fix */
1772     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1773   }
1774 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1775   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1776 #else
1777   {
1778     Scalar    s1,s2,s3,s4,x1,x2,x3,x4;
1779     MatScalar *v;
1780     int       jdx,idt,idx,nz,*vi,i,ai16;
1781 
1782   /* forward solve the lower triangular */
1783   idx    = 0;
1784   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1785   for (i=1; i<n; i++) {
1786     v     =  aa      + 16*ai[i];
1787     vi    =  aj      + ai[i];
1788     nz    =  diag[i] - ai[i];
1789     idx   +=  4;
1790     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1791     while (nz--) {
1792       jdx   = 4*(*vi++);
1793       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1794       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1795       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1796       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1797       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1798       v    += 16;
1799     }
1800     x[idx]   = s1;
1801     x[1+idx] = s2;
1802     x[2+idx] = s3;
1803     x[3+idx] = s4;
1804   }
1805   /* backward solve the upper triangular */
1806   idt = 4*(n-1);
1807   for (i=n-1; i>=0; i--){
1808     ai16 = 16*diag[i];
1809     v    = aa + ai16 + 16;
1810     vi   = aj + diag[i] + 1;
1811     nz   = ai[i+1] - diag[i] - 1;
1812     s1 = x[idt];  s2 = x[1+idt];
1813     s3 = x[2+idt];s4 = x[3+idt];
1814     while (nz--) {
1815       idx   = 4*(*vi++);
1816       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1817       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1818       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1819       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1820       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1821       v    += 16;
1822     }
1823     v        = aa + ai16;
1824     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
1825     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
1826     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1827     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1828     idt -= 4;
1829   }
1830   }
1831 #endif
1832 
1833   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1834   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1835   PLogFlops(2*16*(a->nz) - 4*a->n);
1836   PetscFunctionReturn(0);
1837 }
1838 
1839 #undef __FUNC__
1840 #define __FUNC__ "MatSolve_SeqBAIJ_3"
1841 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1842 {
1843   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1844   IS              iscol=a->col,isrow=a->row;
1845   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1846   int             *diag = a->diag;
1847   MatScalar       *aa=a->a,*v;
1848   Scalar          *x,*b,s1,s2,s3,x1,x2,x3,*t;
1849 
1850   PetscFunctionBegin;
1851   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1852   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1853   t  = a->solve_work;
1854 
1855   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1856   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1857 
1858   /* forward solve the lower triangular */
1859   idx    = 3*(*r++);
1860   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1861   for (i=1; i<n; i++) {
1862     v     = aa + 9*ai[i];
1863     vi    = aj + ai[i];
1864     nz    = diag[i] - ai[i];
1865     idx   = 3*(*r++);
1866     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1867     while (nz--) {
1868       idx   = 3*(*vi++);
1869       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1870       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1871       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1872       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1873       v += 9;
1874     }
1875     idx = 3*i;
1876     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1877   }
1878   /* backward solve the upper triangular */
1879   for (i=n-1; i>=0; i--){
1880     v    = aa + 9*diag[i] + 9;
1881     vi   = aj + diag[i] + 1;
1882     nz   = ai[i+1] - diag[i] - 1;
1883     idt  = 3*i;
1884     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1885     while (nz--) {
1886       idx   = 3*(*vi++);
1887       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1888       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1889       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1890       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1891       v += 9;
1892     }
1893     idc = 3*(*c--);
1894     v   = aa + 9*diag[i];
1895     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1896     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1897     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1898   }
1899   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1900   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1901   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1902   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1903   PLogFlops(2*9*(a->nz) - 3*a->n);
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 /*
1908       Special case where the matrix was ILU(0) factored in the natural
1909    ordering. This eliminates the need for the column and row permutation.
1910 */
1911 #undef __FUNC__
1912 #define __FUNC__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
1913 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1914 {
1915   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1916   int             n=a->mbs,*ai=a->i,*aj=a->j;
1917   int             ierr,*diag = a->diag;
1918   MatScalar       *aa=a->a,*v;
1919   Scalar          *x,*b,s1,s2,s3,x1,x2,x3;
1920   int             jdx,idt,idx,nz,*vi,i;
1921 
1922   PetscFunctionBegin;
1923   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1924   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1925 
1926 
1927   /* forward solve the lower triangular */
1928   idx    = 0;
1929   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
1930   for (i=1; i<n; i++) {
1931     v     =  aa      + 9*ai[i];
1932     vi    =  aj      + ai[i];
1933     nz    =  diag[i] - ai[i];
1934     idx   +=  3;
1935     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
1936     while (nz--) {
1937       jdx   = 3*(*vi++);
1938       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1939       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1940       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1941       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1942       v    += 9;
1943     }
1944     x[idx]   = s1;
1945     x[1+idx] = s2;
1946     x[2+idx] = s3;
1947   }
1948   /* backward solve the upper triangular */
1949   for (i=n-1; i>=0; i--){
1950     v    = aa + 9*diag[i] + 9;
1951     vi   = aj + diag[i] + 1;
1952     nz   = ai[i+1] - diag[i] - 1;
1953     idt  = 3*i;
1954     s1 = x[idt];  s2 = x[1+idt];
1955     s3 = x[2+idt];
1956     while (nz--) {
1957       idx   = 3*(*vi++);
1958       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
1959       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1960       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1961       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1962       v    += 9;
1963     }
1964     v        = aa +  9*diag[i];
1965     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1966     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1967     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1968   }
1969 
1970   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1971   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1972   PLogFlops(2*9*(a->nz) - 3*a->n);
1973   PetscFunctionReturn(0);
1974 }
1975 
1976 #undef __FUNC__
1977 #define __FUNC__ "MatSolve_SeqBAIJ_2"
1978 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1979 {
1980   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1981   IS              iscol=a->col,isrow=a->row;
1982   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1983   int             *diag = a->diag;
1984   MatScalar       *aa=a->a,*v;
1985   Scalar          *x,*b,s1,s2,x1,x2,*t;
1986 
1987   PetscFunctionBegin;
1988   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1989   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1990   t  = a->solve_work;
1991 
1992   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1993   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1994 
1995   /* forward solve the lower triangular */
1996   idx    = 2*(*r++);
1997   t[0] = b[idx]; t[1] = b[1+idx];
1998   for (i=1; i<n; i++) {
1999     v     = aa + 4*ai[i];
2000     vi    = aj + ai[i];
2001     nz    = diag[i] - ai[i];
2002     idx   = 2*(*r++);
2003     s1  = b[idx]; s2 = b[1+idx];
2004     while (nz--) {
2005       idx   = 2*(*vi++);
2006       x1    = t[idx]; x2 = t[1+idx];
2007       s1 -= v[0]*x1 + v[2]*x2;
2008       s2 -= v[1]*x1 + v[3]*x2;
2009       v += 4;
2010     }
2011     idx = 2*i;
2012     t[idx] = s1; t[1+idx] = s2;
2013   }
2014   /* backward solve the upper triangular */
2015   for (i=n-1; i>=0; i--){
2016     v    = aa + 4*diag[i] + 4;
2017     vi   = aj + diag[i] + 1;
2018     nz   = ai[i+1] - diag[i] - 1;
2019     idt  = 2*i;
2020     s1 = t[idt]; s2 = t[1+idt];
2021     while (nz--) {
2022       idx   = 2*(*vi++);
2023       x1    = t[idx]; x2 = t[1+idx];
2024       s1 -= v[0]*x1 + v[2]*x2;
2025       s2 -= v[1]*x1 + v[3]*x2;
2026       v += 4;
2027     }
2028     idc = 2*(*c--);
2029     v   = aa + 4*diag[i];
2030     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2031     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2032   }
2033   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2034   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2035   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2036   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2037   PLogFlops(2*4*(a->nz) - 2*a->n);
2038   PetscFunctionReturn(0);
2039 }
2040 
2041 /*
2042       Special case where the matrix was ILU(0) factored in the natural
2043    ordering. This eliminates the need for the column and row permutation.
2044 */
2045 #undef __FUNC__
2046 #define __FUNC__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2047 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2048 {
2049   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2050   int             n=a->mbs,*ai=a->i,*aj=a->j;
2051   int             ierr,*diag = a->diag;
2052   MatScalar       *aa=a->a,*v;
2053   Scalar          *x,*b,s1,s2,x1,x2;
2054   int             jdx,idt,idx,nz,*vi,i;
2055 
2056   PetscFunctionBegin;
2057   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2058   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2059 
2060   /* forward solve the lower triangular */
2061   idx    = 0;
2062   x[0]   = b[0]; x[1] = b[1];
2063   for (i=1; i<n; i++) {
2064     v     =  aa      + 4*ai[i];
2065     vi    =  aj      + ai[i];
2066     nz    =  diag[i] - ai[i];
2067     idx   +=  2;
2068     s1  =  b[idx];s2 = b[1+idx];
2069     while (nz--) {
2070       jdx   = 2*(*vi++);
2071       x1    = x[jdx];x2 = x[1+jdx];
2072       s1 -= v[0]*x1 + v[2]*x2;
2073       s2 -= v[1]*x1 + v[3]*x2;
2074       v    += 4;
2075     }
2076     x[idx]   = s1;
2077     x[1+idx] = s2;
2078   }
2079   /* backward solve the upper triangular */
2080   for (i=n-1; i>=0; i--){
2081     v    = aa + 4*diag[i] + 4;
2082     vi   = aj + diag[i] + 1;
2083     nz   = ai[i+1] - diag[i] - 1;
2084     idt  = 2*i;
2085     s1 = x[idt];  s2 = x[1+idt];
2086     while (nz--) {
2087       idx   = 2*(*vi++);
2088       x1    = x[idx];   x2 = x[1+idx];
2089       s1 -= v[0]*x1 + v[2]*x2;
2090       s2 -= v[1]*x1 + v[3]*x2;
2091       v    += 4;
2092     }
2093     v        = aa +  4*diag[i];
2094     x[idt]   = v[0]*s1 + v[2]*s2;
2095     x[1+idt] = v[1]*s1 + v[3]*s2;
2096   }
2097 
2098   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2099   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2100   PLogFlops(2*4*(a->nz) - 2*a->n);
2101   PetscFunctionReturn(0);
2102 }
2103 
2104 #undef __FUNC__
2105 #define __FUNC__ "MatSolve_SeqBAIJ_1"
2106 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2107 {
2108   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2109   IS              iscol=a->col,isrow=a->row;
2110   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2111   int             *diag = a->diag;
2112   MatScalar       *aa=a->a,*v;
2113   Scalar          *x,*b,s1,*t;
2114 
2115   PetscFunctionBegin;
2116   if (!n) PetscFunctionReturn(0);
2117 
2118   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2119   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2120   t  = a->solve_work;
2121 
2122   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2123   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2124 
2125   /* forward solve the lower triangular */
2126   t[0] = b[*r++];
2127   for (i=1; i<n; i++) {
2128     v     = aa + ai[i];
2129     vi    = aj + ai[i];
2130     nz    = diag[i] - ai[i];
2131     s1  = b[*r++];
2132     while (nz--) {
2133       s1 -= (*v++)*t[*vi++];
2134     }
2135     t[i] = s1;
2136   }
2137   /* backward solve the upper triangular */
2138   for (i=n-1; i>=0; i--){
2139     v    = aa + diag[i] + 1;
2140     vi   = aj + diag[i] + 1;
2141     nz   = ai[i+1] - diag[i] - 1;
2142     s1 = t[i];
2143     while (nz--) {
2144       s1 -= (*v++)*t[*vi++];
2145     }
2146     x[*c--] = t[i] = aa[diag[i]]*s1;
2147   }
2148 
2149   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2150   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2151   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2152   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2153   PLogFlops(2*1*(a->nz) - a->n);
2154   PetscFunctionReturn(0);
2155 }
2156 /*
2157       Special case where the matrix was ILU(0) factored in the natural
2158    ordering. This eliminates the need for the column and row permutation.
2159 */
2160 #undef __FUNC__
2161 #define __FUNC__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2162 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2163 {
2164   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2165   int             n=a->mbs,*ai=a->i,*aj=a->j;
2166   int             ierr,*diag = a->diag;
2167   MatScalar       *aa=a->a;
2168   Scalar          *x,*b;
2169   Scalar          s1,x1;
2170   MatScalar       *v;
2171   int             jdx,idt,idx,nz,*vi,i;
2172 
2173   PetscFunctionBegin;
2174   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2175   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2176 
2177   /* forward solve the lower triangular */
2178   idx    = 0;
2179   x[0]   = b[0];
2180   for (i=1; i<n; i++) {
2181     v     =  aa      + ai[i];
2182     vi    =  aj      + ai[i];
2183     nz    =  diag[i] - ai[i];
2184     idx   +=  1;
2185     s1  =  b[idx];
2186     while (nz--) {
2187       jdx   = *vi++;
2188       x1    = x[jdx];
2189       s1 -= v[0]*x1;
2190       v    += 1;
2191     }
2192     x[idx]   = s1;
2193   }
2194   /* backward solve the upper triangular */
2195   for (i=n-1; i>=0; i--){
2196     v    = aa + diag[i] + 1;
2197     vi   = aj + diag[i] + 1;
2198     nz   = ai[i+1] - diag[i] - 1;
2199     idt  = i;
2200     s1 = x[idt];
2201     while (nz--) {
2202       idx   = *vi++;
2203       x1    = x[idx];
2204       s1 -= v[0]*x1;
2205       v    += 1;
2206     }
2207     v        = aa +  diag[i];
2208     x[idt]   = v[0]*s1;
2209   }
2210   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2211   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2212   PLogFlops(2*(a->nz) - a->n);
2213   PetscFunctionReturn(0);
2214 }
2215 
2216 /* ----------------------------------------------------------------*/
2217 /*
2218      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2219    except that the data structure of Mat_SeqAIJ is slightly different.
2220    Not a good example of code reuse.
2221 */
2222 extern int MatMissingDiagonal_SeqBAIJ(Mat);
2223 
2224 #undef __FUNC__
2225 #define __FUNC__ "MatILUFactorSymbolic_SeqBAIJ"
2226 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2227 {
2228   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2229   IS          isicol;
2230   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2231   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2232   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2233   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2234   PetscTruth  col_identity,row_identity;
2235   PetscReal   f;
2236 
2237   PetscFunctionBegin;
2238   if (info) {
2239     f             = info->fill;
2240     levels        = (int)info->levels;
2241     diagonal_fill = (int)info->diagonal_fill;
2242   } else {
2243     f             = 1.0;
2244     levels        = 0;
2245     diagonal_fill = 0;
2246   }
2247   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2248 
2249   /* special case that simply copies fill pattern */
2250   PetscValidHeaderSpecific(isrow,IS_COOKIE);
2251   PetscValidHeaderSpecific(iscol,IS_COOKIE);
2252   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2253   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2254   if (!levels && row_identity && col_identity) {
2255     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2256     (*fact)->factor = FACTOR_LU;
2257     b               = (Mat_SeqBAIJ*)(*fact)->data;
2258     if (!b->diag) {
2259       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2260     }
2261     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2262     b->row        = isrow;
2263     b->col        = iscol;
2264     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2265     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2266     b->icol       = isicol;
2267     b->solve_work = (Scalar*)PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
2268    /*
2269         Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2270         for ILU(0) factorization with natural ordering
2271    */
2272     switch (b->bs) {
2273     case 2:
2274       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2275       (*fact)->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2276       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
2277       break;
2278     case 3:
2279       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2280       (*fact)->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2281       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
2282       break;
2283     case 4:
2284       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2285       (*fact)->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
2286       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
2287       break;
2288     case 5:
2289       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2290       (*fact)->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
2291       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
2292       break;
2293     case 6:
2294       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2295       (*fact)->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
2296       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
2297       break;
2298     case 7:
2299       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2300       (*fact)->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
2301       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
2302     break;
2303   }
2304     PetscFunctionReturn(0);
2305   }
2306 
2307   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
2308   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
2309 
2310   /* get new row pointers */
2311   ainew = (int*)PetscMalloc((n+1)*sizeof(int));CHKPTRQ(ainew);
2312   ainew[0] = 0;
2313   /* don't know how many column pointers are needed so estimate */
2314   jmax = (int)(f*ai[n] + 1);
2315   ajnew = (int*)PetscMalloc((jmax)*sizeof(int));CHKPTRQ(ajnew);
2316   /* ajfill is level of fill for each fill entry */
2317   ajfill = (int*)PetscMalloc((jmax)*sizeof(int));CHKPTRQ(ajfill);
2318   /* fill is a linked list of nonzeros in active row */
2319   fill = (int*)PetscMalloc((n+1)*sizeof(int));CHKPTRQ(fill);
2320   /* im is level for each filled value */
2321   im = (int*)PetscMalloc((n+1)*sizeof(int));CHKPTRQ(im);
2322   /* dloc is location of diagonal in factor */
2323   dloc = (int*)PetscMalloc((n+1)*sizeof(int));CHKPTRQ(dloc);
2324   dloc[0]  = 0;
2325   for (prow=0; prow<n; prow++) {
2326 
2327     /* copy prow into linked list */
2328     nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2329     if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,1,"Empty row in matrix");
2330     xi         = aj + ai[r[prow]];
2331     fill[n]    = n;
2332     fill[prow] = -1; /* marker for diagonal entry */
2333     while (nz--) {
2334       fm  = n;
2335       idx = ic[*xi++];
2336       do {
2337         m  = fm;
2338         fm = fill[m];
2339       } while (fm < idx);
2340       fill[m]   = idx;
2341       fill[idx] = fm;
2342       im[idx]   = 0;
2343     }
2344 
2345     /* make sure diagonal entry is included */
2346     if (diagonal_fill && fill[prow] == -1) {
2347       fm = n;
2348       while (fill[fm] < prow) fm = fill[fm];
2349       fill[prow] = fill[fm];  /* insert diagonal into linked list */
2350       fill[fm]   = prow;
2351       im[prow]   = 0;
2352       nzf++;
2353       dcount++;
2354     }
2355 
2356     nzi = 0;
2357     row = fill[n];
2358     while (row < prow) {
2359       incrlev = im[row] + 1;
2360       nz      = dloc[row];
2361       xi      = ajnew  + ainew[row] + nz + 1;
2362       flev    = ajfill + ainew[row] + nz + 1;
2363       nnz     = ainew[row+1] - ainew[row] - nz - 1;
2364       fm      = row;
2365       while (nnz-- > 0) {
2366         idx = *xi++;
2367         if (*flev + incrlev > levels) {
2368           flev++;
2369           continue;
2370         }
2371         do {
2372           m  = fm;
2373           fm = fill[m];
2374         } while (fm < idx);
2375         if (fm != idx) {
2376           im[idx]   = *flev + incrlev;
2377           fill[m]   = idx;
2378           fill[idx] = fm;
2379           fm        = idx;
2380           nzf++;
2381         } else {
2382           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2383         }
2384         flev++;
2385       }
2386       row = fill[row];
2387       nzi++;
2388     }
2389     /* copy new filled row into permanent storage */
2390     ainew[prow+1] = ainew[prow] + nzf;
2391     if (ainew[prow+1] > jmax) {
2392 
2393       /* estimate how much additional space we will need */
2394       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2395       /* just double the memory each time */
2396       int maxadd = jmax;
2397       /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2398       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2399       jmax += maxadd;
2400 
2401       /* allocate a longer ajnew and ajfill */
2402       xi = (int*)PetscMalloc(jmax*sizeof(int));CHKPTRQ(xi);
2403       ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2404       ierr = PetscFree(ajnew);CHKERRQ(ierr);
2405       ajnew = xi;
2406       xi = (int*)PetscMalloc(jmax*sizeof(int));CHKPTRQ(xi);
2407       ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2408       ierr = PetscFree(ajfill);CHKERRQ(ierr);
2409       ajfill = xi;
2410       realloc++; /* count how many reallocations are needed */
2411     }
2412     xi          = ajnew + ainew[prow];
2413     flev        = ajfill + ainew[prow];
2414     dloc[prow]  = nzi;
2415     fm          = fill[n];
2416     while (nzf--) {
2417       *xi++   = fm;
2418       *flev++ = im[fm];
2419       fm      = fill[fm];
2420     }
2421     /* make sure row has diagonal entry */
2422     if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2423       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,1,"Row %d has missing diagonal in factored matrix\n\
2424     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2425     }
2426   }
2427   ierr = PetscFree(ajfill);CHKERRQ(ierr);
2428   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2429   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2430   ierr = PetscFree(fill);CHKERRQ(ierr);
2431   ierr = PetscFree(im);CHKERRQ(ierr);
2432 
2433   {
2434     PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2435     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
2436     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
2437     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
2438     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
2439     if (diagonal_fill) {
2440       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
2441     }
2442   }
2443 
2444   /* put together the new matrix */
2445   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
2446   PLogObjectParent(*fact,isicol);
2447   b = (Mat_SeqBAIJ*)(*fact)->data;
2448   ierr = PetscFree(b->imax);CHKERRQ(ierr);
2449   b->singlemalloc = PETSC_FALSE;
2450   len = bs2*ainew[n]*sizeof(MatScalar);
2451   /* the next line frees the default space generated by the Create() */
2452   ierr = PetscFree(b->a);CHKERRQ(ierr);
2453   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
2454   b->a          = (MatScalar*)PetscMalloc(len);CHKPTRQ(b->a);
2455   b->j          = ajnew;
2456   b->i          = ainew;
2457   for (i=0; i<n; i++) dloc[i] += ainew[i];
2458   b->diag       = dloc;
2459   b->ilen       = 0;
2460   b->imax       = 0;
2461   b->row        = isrow;
2462   b->col        = iscol;
2463   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2464   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2465   b->icol       = isicol;
2466   b->solve_work = (Scalar*)PetscMalloc((bs*n+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
2467   /* In b structure:  Free imax, ilen, old a, old j.
2468      Allocate dloc, solve_work, new a, new j */
2469   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
2470   b->maxnz          = b->nz = ainew[n];
2471   (*fact)->factor   = FACTOR_LU;
2472 
2473   (*fact)->info.factor_mallocs    = realloc;
2474   (*fact)->info.fill_ratio_given  = f;
2475   (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2476 
2477   PetscFunctionReturn(0);
2478 }
2479 
2480 
2481 
2482 
2483