xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 1ddff7312a0b975f32ee7a516f9d0a6f9dc4e0f4)
1 /*$Id: baijfact2.c,v 1.55 2001/06/22 19:34:59 buschelm Exp bsmith $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17   int             *diag = a->diag;
18   MatScalar       *aa=a->a,*v;
19   Scalar          s1,*x,*b;
20 
21   PetscFunctionBegin;
22   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24 
25   /* forward solve the U^T */
26   for (i=0; i<n; i++) {
27 
28     v     = aa + diag[i];
29     /* multiply by the inverse of the block diagonal */
30     s1    = (*v++)*b[i];
31     vi    = aj + diag[i] + 1;
32     nz    = ai[i+1] - diag[i] - 1;
33     while (nz--) {
34       x[*vi++]  -= (*v++)*s1;
35     }
36     x[i]   = s1;
37   }
38   /* backward solve the L^T */
39   for (i=n-1; i>=0; i--){
40     v    = aa + diag[i] - 1;
41     vi   = aj + diag[i] - 1;
42     nz   = diag[i] - ai[i];
43     s1   = x[i];
44     while (nz--) {
45       x[*vi--]   -=  (*v--)*s1;
46     }
47   }
48   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50   PetscLogFlops(2*(a->nz) - A->n);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
56 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57 {
58   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60   int             *diag = a->diag,oidx;
61   MatScalar       *aa=a->a,*v;
62   Scalar          s1,s2,x1,x2;
63   Scalar          *x,*b;
64 
65   PetscFunctionBegin;
66   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
67   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
68 
69   /* forward solve the U^T */
70   idx = 0;
71   for (i=0; i<n; i++) {
72 
73     v     = aa + 4*diag[i];
74     /* multiply by the inverse of the block diagonal */
75     x1 = b[idx];   x2 = b[1+idx];
76     s1 = v[0]*x1  +  v[1]*x2;
77     s2 = v[2]*x1  +  v[3]*x2;
78     v += 4;
79 
80     vi    = aj + diag[i] + 1;
81     nz    = ai[i+1] - diag[i] - 1;
82     while (nz--) {
83       oidx = 2*(*vi++);
84       x[oidx]   -= v[0]*s1  +  v[1]*s2;
85       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
86       v  += 4;
87     }
88     x[idx]   = s1;x[1+idx] = s2;
89     idx += 2;
90   }
91   /* backward solve the L^T */
92   for (i=n-1; i>=0; i--){
93     v    = aa + 4*diag[i] - 4;
94     vi   = aj + diag[i] - 1;
95     nz   = diag[i] - ai[i];
96     idt  = 2*i;
97     s1   = x[idt];  s2 = x[1+idt];
98     while (nz--) {
99       idx   = 2*(*vi--);
100       x[idx]   -=  v[0]*s1 +  v[1]*s2;
101       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
102       v -= 4;
103     }
104   }
105   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
106   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
107   PetscLogFlops(2*4*(a->nz) - 2*A->n);
108   PetscFunctionReturn(0);
109 }
110 
111 #undef __FUNCT__
112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
113 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
114 {
115   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
116   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
117   int             *diag = a->diag,oidx;
118   MatScalar       *aa=a->a,*v;
119   Scalar          s1,s2,s3,x1,x2,x3;
120   Scalar          *x,*b;
121 
122   PetscFunctionBegin;
123   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
124   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
125 
126   /* forward solve the U^T */
127   idx = 0;
128   for (i=0; i<n; i++) {
129 
130     v     = aa + 9*diag[i];
131     /* multiply by the inverse of the block diagonal */
132     x1 = b[idx];   x2 = b[1+idx]; x3    = b[2+idx];
133     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
134     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
135     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
136     v += 9;
137 
138     vi    = aj + diag[i] + 1;
139     nz    = ai[i+1] - diag[i] - 1;
140     while (nz--) {
141       oidx = 3*(*vi++);
142       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
143       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
144       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
145       v  += 9;
146     }
147     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
148     idx += 3;
149   }
150   /* backward solve the L^T */
151   for (i=n-1; i>=0; i--){
152     v    = aa + 9*diag[i] - 9;
153     vi   = aj + diag[i] - 1;
154     nz   = diag[i] - ai[i];
155     idt  = 3*i;
156     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
157     while (nz--) {
158       idx   = 3*(*vi--);
159       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
160       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
161       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
162       v -= 9;
163     }
164   }
165   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
167   PetscLogFlops(2*9*(a->nz) - 3*A->n);
168   PetscFunctionReturn(0);
169 }
170 
171 #undef __FUNCT__
172 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
173 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
174 {
175   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
176   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
177   int             *diag = a->diag,oidx;
178   MatScalar       *aa=a->a,*v;
179   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
180   Scalar          *x,*b;
181 
182   PetscFunctionBegin;
183   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
184   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
185 
186   /* forward solve the U^T */
187   idx = 0;
188   for (i=0; i<n; i++) {
189 
190     v     = aa + 16*diag[i];
191     /* multiply by the inverse of the block diagonal */
192     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx];
193     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
194     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
195     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
196     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
197     v += 16;
198 
199     vi    = aj + diag[i] + 1;
200     nz    = ai[i+1] - diag[i] - 1;
201     while (nz--) {
202       oidx = 4*(*vi++);
203       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
204       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
205       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
206       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
207       v  += 16;
208     }
209     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
210     idx += 4;
211   }
212   /* backward solve the L^T */
213   for (i=n-1; i>=0; i--){
214     v    = aa + 16*diag[i] - 16;
215     vi   = aj + diag[i] - 1;
216     nz   = diag[i] - ai[i];
217     idt  = 4*i;
218     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
219     while (nz--) {
220       idx   = 4*(*vi--);
221       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
222       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
223       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
224       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
225       v -= 16;
226     }
227   }
228   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
229   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
230   PetscLogFlops(2*16*(a->nz) - 4*A->n);
231   PetscFunctionReturn(0);
232 }
233 
234 #undef __FUNCT__
235 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
236 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
237 {
238   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
239   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
240   int             *diag = a->diag,oidx;
241   MatScalar       *aa=a->a,*v;
242   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
243   Scalar          *x,*b;
244 
245   PetscFunctionBegin;
246   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
247   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
248 
249   /* forward solve the U^T */
250   idx = 0;
251   for (i=0; i<n; i++) {
252 
253     v     = aa + 25*diag[i];
254     /* multiply by the inverse of the block diagonal */
255     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
256     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
257     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
258     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
259     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
260     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
261     v += 25;
262 
263     vi    = aj + diag[i] + 1;
264     nz    = ai[i+1] - diag[i] - 1;
265     while (nz--) {
266       oidx = 5*(*vi++);
267       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
268       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
269       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
270       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
271       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
272       v  += 25;
273     }
274     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
275     idx += 5;
276   }
277   /* backward solve the L^T */
278   for (i=n-1; i>=0; i--){
279     v    = aa + 25*diag[i] - 25;
280     vi   = aj + diag[i] - 1;
281     nz   = diag[i] - ai[i];
282     idt  = 5*i;
283     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
284     while (nz--) {
285       idx   = 5*(*vi--);
286       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
287       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
288       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
289       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
290       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
291       v -= 25;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   PetscLogFlops(2*25*(a->nz) - 5*A->n);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
302 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
305   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
306   int             *diag = a->diag,oidx;
307   MatScalar       *aa=a->a,*v;
308   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
309   Scalar          *x,*b;
310 
311   PetscFunctionBegin;
312   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
313   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
314 
315   /* forward solve the U^T */
316   idx = 0;
317   for (i=0; i<n; i++) {
318 
319     v     = aa + 36*diag[i];
320     /* multiply by the inverse of the block diagonal */
321     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
322     x6    = b[5+idx];
323     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
324     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
325     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
326     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
327     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
328     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
329     v += 36;
330 
331     vi    = aj + diag[i] + 1;
332     nz    = ai[i+1] - diag[i] - 1;
333     while (nz--) {
334       oidx = 6*(*vi++);
335       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
336       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
337       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
338       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
339       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
340       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
341       v  += 36;
342     }
343     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
344     x[5+idx] = s6;
345     idx += 6;
346   }
347   /* backward solve the L^T */
348   for (i=n-1; i>=0; i--){
349     v    = aa + 36*diag[i] - 36;
350     vi   = aj + diag[i] - 1;
351     nz   = diag[i] - ai[i];
352     idt  = 6*i;
353     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
354     s6 = x[5+idt];
355     while (nz--) {
356       idx   = 6*(*vi--);
357       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
358       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
359       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
360       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
361       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
362       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
363       v -= 36;
364     }
365   }
366   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
367   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
368   PetscLogFlops(2*36*(a->nz) - 6*A->n);
369   PetscFunctionReturn(0);
370 }
371 
372 #undef __FUNCT__
373 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
374 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
375 {
376   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
377   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
378   int             *diag = a->diag,oidx;
379   MatScalar       *aa=a->a,*v;
380   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
381   Scalar          *x,*b;
382 
383   PetscFunctionBegin;
384   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
385   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
386 
387   /* forward solve the U^T */
388   idx = 0;
389   for (i=0; i<n; i++) {
390 
391     v     = aa + 49*diag[i];
392     /* multiply by the inverse of the block diagonal */
393     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
394     x6    = b[5+idx]; x7 = b[6+idx];
395     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
396     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
397     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
398     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
399     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
400     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
401     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
402     v += 49;
403 
404     vi    = aj + diag[i] + 1;
405     nz    = ai[i+1] - diag[i] - 1;
406     while (nz--) {
407       oidx = 7*(*vi++);
408       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
409       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
410       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
411       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
412       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
413       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
414       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
415       v  += 49;
416     }
417     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
418     x[5+idx] = s6;x[6+idx] = s7;
419     idx += 7;
420   }
421   /* backward solve the L^T */
422   for (i=n-1; i>=0; i--){
423     v    = aa + 49*diag[i] - 49;
424     vi   = aj + diag[i] - 1;
425     nz   = diag[i] - ai[i];
426     idt  = 7*i;
427     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
428     s6 = x[5+idt];s7 = x[6+idt];
429     while (nz--) {
430       idx   = 7*(*vi--);
431       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
432       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
433       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
434       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
435       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
436       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
437       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
438       v -= 49;
439     }
440   }
441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
443   PetscLogFlops(2*49*(a->nz) - 7*A->n);
444   PetscFunctionReturn(0);
445 }
446 
447 /*---------------------------------------------------------------------------------------------*/
448 #undef __FUNCT__
449 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
450 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
451 {
452   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
453   IS              iscol=a->col,isrow=a->row;
454   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
455   int             *diag = a->diag;
456   MatScalar       *aa=a->a,*v;
457   Scalar          s1,*x,*b,*t;
458 
459   PetscFunctionBegin;
460   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
461   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
462   t  = a->solve_work;
463 
464   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
465   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
466 
467   /* copy the b into temp work space according to permutation */
468   for (i=0; i<n; i++) {
469     t[i] = b[c[i]];
470   }
471 
472   /* forward solve the U^T */
473   for (i=0; i<n; i++) {
474 
475     v     = aa + diag[i];
476     /* multiply by the inverse of the block diagonal */
477     s1    = (*v++)*t[i];
478     vi    = aj + diag[i] + 1;
479     nz    = ai[i+1] - diag[i] - 1;
480     while (nz--) {
481       t[*vi++]  -= (*v++)*s1;
482     }
483     t[i]   = s1;
484   }
485   /* backward solve the L^T */
486   for (i=n-1; i>=0; i--){
487     v    = aa + diag[i] - 1;
488     vi   = aj + diag[i] - 1;
489     nz   = diag[i] - ai[i];
490     s1   = t[i];
491     while (nz--) {
492       t[*vi--]   -=  (*v--)*s1;
493     }
494   }
495 
496   /* copy t into x according to permutation */
497   for (i=0; i<n; i++) {
498     x[r[i]]   = t[i];
499   }
500 
501   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
502   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
503   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
504   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
505   PetscLogFlops(2*(a->nz) - A->n);
506   PetscFunctionReturn(0);
507 }
508 
509 #undef __FUNCT__
510 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
511 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
512 {
513   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
514   IS              iscol=a->col,isrow=a->row;
515   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
516   int             *diag = a->diag,ii,ic,ir,oidx;
517   MatScalar       *aa=a->a,*v;
518   Scalar          s1,s2,x1,x2;
519   Scalar          *x,*b,*t;
520 
521   PetscFunctionBegin;
522   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
523   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524   t  = a->solve_work;
525 
526   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
527   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
528 
529   /* copy the b into temp work space according to permutation */
530   ii = 0;
531   for (i=0; i<n; i++) {
532     ic      = 2*c[i];
533     t[ii]   = b[ic];
534     t[ii+1] = b[ic+1];
535     ii += 2;
536   }
537 
538   /* forward solve the U^T */
539   idx = 0;
540   for (i=0; i<n; i++) {
541 
542     v     = aa + 4*diag[i];
543     /* multiply by the inverse of the block diagonal */
544     x1    = t[idx];   x2 = t[1+idx];
545     s1 = v[0]*x1  +  v[1]*x2;
546     s2 = v[2]*x1  +  v[3]*x2;
547     v += 4;
548 
549     vi    = aj + diag[i] + 1;
550     nz    = ai[i+1] - diag[i] - 1;
551     while (nz--) {
552       oidx = 2*(*vi++);
553       t[oidx]   -= v[0]*s1  +  v[1]*s2;
554       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
555       v  += 4;
556     }
557     t[idx]   = s1;t[1+idx] = s2;
558     idx += 2;
559   }
560   /* backward solve the L^T */
561   for (i=n-1; i>=0; i--){
562     v    = aa + 4*diag[i] - 4;
563     vi   = aj + diag[i] - 1;
564     nz   = diag[i] - ai[i];
565     idt  = 2*i;
566     s1 = t[idt];  s2 = t[1+idt];
567     while (nz--) {
568       idx   = 2*(*vi--);
569       t[idx]   -=  v[0]*s1 +  v[1]*s2;
570       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
571       v -= 4;
572     }
573   }
574 
575   /* copy t into x according to permutation */
576   ii = 0;
577   for (i=0; i<n; i++) {
578     ir      = 2*r[i];
579     x[ir]   = t[ii];
580     x[ir+1] = t[ii+1];
581     ii += 2;
582   }
583 
584   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
585   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
586   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
587   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
588   PetscLogFlops(2*4*(a->nz) - 2*A->n);
589   PetscFunctionReturn(0);
590 }
591 
592 #undef __FUNCT__
593 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
594 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
595 {
596   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
597   IS              iscol=a->col,isrow=a->row;
598   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
599   int             *diag = a->diag,ii,ic,ir,oidx;
600   MatScalar       *aa=a->a,*v;
601   Scalar          s1,s2,s3,x1,x2,x3;
602   Scalar          *x,*b,*t;
603 
604   PetscFunctionBegin;
605   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
606   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
607   t  = a->solve_work;
608 
609   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
610   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
611 
612   /* copy the b into temp work space according to permutation */
613   ii = 0;
614   for (i=0; i<n; i++) {
615     ic      = 3*c[i];
616     t[ii]   = b[ic];
617     t[ii+1] = b[ic+1];
618     t[ii+2] = b[ic+2];
619     ii += 3;
620   }
621 
622   /* forward solve the U^T */
623   idx = 0;
624   for (i=0; i<n; i++) {
625 
626     v     = aa + 9*diag[i];
627     /* multiply by the inverse of the block diagonal */
628     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
629     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
630     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
631     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
632     v += 9;
633 
634     vi    = aj + diag[i] + 1;
635     nz    = ai[i+1] - diag[i] - 1;
636     while (nz--) {
637       oidx = 3*(*vi++);
638       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
639       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
640       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
641       v  += 9;
642     }
643     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
644     idx += 3;
645   }
646   /* backward solve the L^T */
647   for (i=n-1; i>=0; i--){
648     v    = aa + 9*diag[i] - 9;
649     vi   = aj + diag[i] - 1;
650     nz   = diag[i] - ai[i];
651     idt  = 3*i;
652     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
653     while (nz--) {
654       idx   = 3*(*vi--);
655       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
656       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
657       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658       v -= 9;
659     }
660   }
661 
662   /* copy t into x according to permutation */
663   ii = 0;
664   for (i=0; i<n; i++) {
665     ir      = 3*r[i];
666     x[ir]   = t[ii];
667     x[ir+1] = t[ii+1];
668     x[ir+2] = t[ii+2];
669     ii += 3;
670   }
671 
672   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
673   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
674   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
675   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
676   PetscLogFlops(2*9*(a->nz) - 3*A->n);
677   PetscFunctionReturn(0);
678 }
679 
680 #undef __FUNCT__
681 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
682 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
683 {
684   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
685   IS              iscol=a->col,isrow=a->row;
686   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
687   int             *diag = a->diag,ii,ic,ir,oidx;
688   MatScalar       *aa=a->a,*v;
689   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
690   Scalar          *x,*b,*t;
691 
692   PetscFunctionBegin;
693   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
694   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
695   t  = a->solve_work;
696 
697   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
698   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
699 
700   /* copy the b into temp work space according to permutation */
701   ii = 0;
702   for (i=0; i<n; i++) {
703     ic      = 4*c[i];
704     t[ii]   = b[ic];
705     t[ii+1] = b[ic+1];
706     t[ii+2] = b[ic+2];
707     t[ii+3] = b[ic+3];
708     ii += 4;
709   }
710 
711   /* forward solve the U^T */
712   idx = 0;
713   for (i=0; i<n; i++) {
714 
715     v     = aa + 16*diag[i];
716     /* multiply by the inverse of the block diagonal */
717     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
718     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
719     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
720     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
721     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
722     v += 16;
723 
724     vi    = aj + diag[i] + 1;
725     nz    = ai[i+1] - diag[i] - 1;
726     while (nz--) {
727       oidx = 4*(*vi++);
728       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
729       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
730       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
731       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
732       v  += 16;
733     }
734     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
735     idx += 4;
736   }
737   /* backward solve the L^T */
738   for (i=n-1; i>=0; i--){
739     v    = aa + 16*diag[i] - 16;
740     vi   = aj + diag[i] - 1;
741     nz   = diag[i] - ai[i];
742     idt  = 4*i;
743     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
744     while (nz--) {
745       idx   = 4*(*vi--);
746       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750       v -= 16;
751     }
752   }
753 
754   /* copy t into x according to permutation */
755   ii = 0;
756   for (i=0; i<n; i++) {
757     ir      = 4*r[i];
758     x[ir]   = t[ii];
759     x[ir+1] = t[ii+1];
760     x[ir+2] = t[ii+2];
761     x[ir+3] = t[ii+3];
762     ii += 4;
763   }
764 
765   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
766   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
767   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   PetscLogFlops(2*16*(a->nz) - 4*A->n);
770   PetscFunctionReturn(0);
771 }
772 
773 #undef __FUNCT__
774 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
775 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
778   IS              iscol=a->col,isrow=a->row;
779   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
780   int             *diag = a->diag,ii,ic,ir,oidx;
781   MatScalar       *aa=a->a,*v;
782   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
783   Scalar          *x,*b,*t;
784 
785   PetscFunctionBegin;
786   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
788   t  = a->solve_work;
789 
790   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
791   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
792 
793   /* copy the b into temp work space according to permutation */
794   ii = 0;
795   for (i=0; i<n; i++) {
796     ic      = 5*c[i];
797     t[ii]   = b[ic];
798     t[ii+1] = b[ic+1];
799     t[ii+2] = b[ic+2];
800     t[ii+3] = b[ic+3];
801     t[ii+4] = b[ic+4];
802     ii += 5;
803   }
804 
805   /* forward solve the U^T */
806   idx = 0;
807   for (i=0; i<n; i++) {
808 
809     v     = aa + 25*diag[i];
810     /* multiply by the inverse of the block diagonal */
811     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
812     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
813     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
814     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
815     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
816     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
817     v += 25;
818 
819     vi    = aj + diag[i] + 1;
820     nz    = ai[i+1] - diag[i] - 1;
821     while (nz--) {
822       oidx = 5*(*vi++);
823       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
824       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
825       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
826       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
827       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
828       v  += 25;
829     }
830     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
831     idx += 5;
832   }
833   /* backward solve the L^T */
834   for (i=n-1; i>=0; i--){
835     v    = aa + 25*diag[i] - 25;
836     vi   = aj + diag[i] - 1;
837     nz   = diag[i] - ai[i];
838     idt  = 5*i;
839     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
840     while (nz--) {
841       idx   = 5*(*vi--);
842       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847       v -= 25;
848     }
849   }
850 
851   /* copy t into x according to permutation */
852   ii = 0;
853   for (i=0; i<n; i++) {
854     ir      = 5*r[i];
855     x[ir]   = t[ii];
856     x[ir+1] = t[ii+1];
857     x[ir+2] = t[ii+2];
858     x[ir+3] = t[ii+3];
859     x[ir+4] = t[ii+4];
860     ii += 5;
861   }
862 
863   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
864   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
865   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
866   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
867   PetscLogFlops(2*25*(a->nz) - 5*A->n);
868   PetscFunctionReturn(0);
869 }
870 
871 #undef __FUNCT__
872 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
873 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
874 {
875   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
876   IS              iscol=a->col,isrow=a->row;
877   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
878   int             *diag = a->diag,ii,ic,ir,oidx;
879   MatScalar       *aa=a->a,*v;
880   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
881   Scalar          *x,*b,*t;
882 
883   PetscFunctionBegin;
884   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
885   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
886   t  = a->solve_work;
887 
888   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
889   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
890 
891   /* copy the b into temp work space according to permutation */
892   ii = 0;
893   for (i=0; i<n; i++) {
894     ic      = 6*c[i];
895     t[ii]   = b[ic];
896     t[ii+1] = b[ic+1];
897     t[ii+2] = b[ic+2];
898     t[ii+3] = b[ic+3];
899     t[ii+4] = b[ic+4];
900     t[ii+5] = b[ic+5];
901     ii += 6;
902   }
903 
904   /* forward solve the U^T */
905   idx = 0;
906   for (i=0; i<n; i++) {
907 
908     v     = aa + 36*diag[i];
909     /* multiply by the inverse of the block diagonal */
910     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
911     x6    = t[5+idx];
912     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
913     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
914     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
915     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
916     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
917     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
918     v += 36;
919 
920     vi    = aj + diag[i] + 1;
921     nz    = ai[i+1] - diag[i] - 1;
922     while (nz--) {
923       oidx = 6*(*vi++);
924       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
925       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
926       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
927       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
928       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
929       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
930       v  += 36;
931     }
932     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
933     t[5+idx] = s6;
934     idx += 6;
935   }
936   /* backward solve the L^T */
937   for (i=n-1; i>=0; i--){
938     v    = aa + 36*diag[i] - 36;
939     vi   = aj + diag[i] - 1;
940     nz   = diag[i] - ai[i];
941     idt  = 6*i;
942     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
943     s6 = t[5+idt];
944     while (nz--) {
945       idx   = 6*(*vi--);
946       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
947       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
948       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
949       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
950       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
951       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
952       v -= 36;
953     }
954   }
955 
956   /* copy t into x according to permutation */
957   ii = 0;
958   for (i=0; i<n; i++) {
959     ir      = 6*r[i];
960     x[ir]   = t[ii];
961     x[ir+1] = t[ii+1];
962     x[ir+2] = t[ii+2];
963     x[ir+3] = t[ii+3];
964     x[ir+4] = t[ii+4];
965     x[ir+5] = t[ii+5];
966     ii += 6;
967   }
968 
969   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
970   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
971   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
972   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
973   PetscLogFlops(2*36*(a->nz) - 6*A->n);
974   PetscFunctionReturn(0);
975 }
976 
977 #undef __FUNCT__
978 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
979 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
980 {
981   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
982   IS              iscol=a->col,isrow=a->row;
983   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
984   int             *diag = a->diag,ii,ic,ir,oidx;
985   MatScalar       *aa=a->a,*v;
986   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
987   Scalar          *x,*b,*t;
988 
989   PetscFunctionBegin;
990   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
991   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
992   t  = a->solve_work;
993 
994   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
995   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
996 
997   /* copy the b into temp work space according to permutation */
998   ii = 0;
999   for (i=0; i<n; i++) {
1000     ic      = 7*c[i];
1001     t[ii]   = b[ic];
1002     t[ii+1] = b[ic+1];
1003     t[ii+2] = b[ic+2];
1004     t[ii+3] = b[ic+3];
1005     t[ii+4] = b[ic+4];
1006     t[ii+5] = b[ic+5];
1007     t[ii+6] = b[ic+6];
1008     ii += 7;
1009   }
1010 
1011   /* forward solve the U^T */
1012   idx = 0;
1013   for (i=0; i<n; i++) {
1014 
1015     v     = aa + 49*diag[i];
1016     /* multiply by the inverse of the block diagonal */
1017     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1018     x6    = t[5+idx]; x7 = t[6+idx];
1019     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1020     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1021     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1022     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1023     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1024     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1025     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1026     v += 49;
1027 
1028     vi    = aj + diag[i] + 1;
1029     nz    = ai[i+1] - diag[i] - 1;
1030     while (nz--) {
1031       oidx = 7*(*vi++);
1032       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1033       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1034       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1035       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1036       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1037       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1038       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1039       v  += 49;
1040     }
1041     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1042     t[5+idx] = s6;t[6+idx] = s7;
1043     idx += 7;
1044   }
1045   /* backward solve the L^T */
1046   for (i=n-1; i>=0; i--){
1047     v    = aa + 49*diag[i] - 49;
1048     vi   = aj + diag[i] - 1;
1049     nz   = diag[i] - ai[i];
1050     idt  = 7*i;
1051     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1052     s6 = t[5+idt];s7 = t[6+idt];
1053     while (nz--) {
1054       idx   = 7*(*vi--);
1055       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1056       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1057       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1058       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1059       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1060       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1061       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1062       v -= 49;
1063     }
1064   }
1065 
1066   /* copy t into x according to permutation */
1067   ii = 0;
1068   for (i=0; i<n; i++) {
1069     ir      = 7*r[i];
1070     x[ir]   = t[ii];
1071     x[ir+1] = t[ii+1];
1072     x[ir+2] = t[ii+2];
1073     x[ir+3] = t[ii+3];
1074     x[ir+4] = t[ii+4];
1075     x[ir+5] = t[ii+5];
1076     x[ir+6] = t[ii+6];
1077     ii += 7;
1078   }
1079 
1080   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1081   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1082   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1083   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1084   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 /* ----------------------------------------------------------- */
1089 #undef __FUNCT__
1090 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1091 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1092 {
1093   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1094   IS              iscol=a->col,isrow=a->row;
1095   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1096   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1097   MatScalar       *aa=a->a,*v;
1098   Scalar          *x,*b,*s,*t,*ls;
1099 
1100   PetscFunctionBegin;
1101   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1102   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1103   t  = a->solve_work;
1104 
1105   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1106   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1107 
1108   /* forward solve the lower triangular */
1109   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1110   for (i=1; i<n; i++) {
1111     v   = aa + bs2*ai[i];
1112     vi  = aj + ai[i];
1113     nz  = a->diag[i] - ai[i];
1114     s = t + bs*i;
1115     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
1116     while (nz--) {
1117       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1118       v += bs2;
1119     }
1120   }
1121   /* backward solve the upper triangular */
1122   ls = a->solve_work + A->n;
1123   for (i=n-1; i>=0; i--){
1124     v   = aa + bs2*(a->diag[i] + 1);
1125     vi  = aj + a->diag[i] + 1;
1126     nz  = ai[i+1] - a->diag[i] - 1;
1127     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1128     while (nz--) {
1129       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1130       v += bs2;
1131     }
1132     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1133     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
1134   }
1135 
1136   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1137   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1138   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1139   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1140   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 #undef __FUNCT__
1145 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1146 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1147 {
1148   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1149   IS              iscol=a->col,isrow=a->row;
1150   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1151   int             *diag = a->diag;
1152   MatScalar       *aa=a->a,*v;
1153   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1154   Scalar          *x,*b,*t;
1155 
1156   PetscFunctionBegin;
1157   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1158   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1159   t  = a->solve_work;
1160 
1161   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1162   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1163 
1164   /* forward solve the lower triangular */
1165   idx    = 7*(*r++);
1166   t[0] = b[idx];   t[1] = b[1+idx];
1167   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1168   t[5] = b[5+idx]; t[6] = b[6+idx];
1169 
1170   for (i=1; i<n; i++) {
1171     v     = aa + 49*ai[i];
1172     vi    = aj + ai[i];
1173     nz    = diag[i] - ai[i];
1174     idx   = 7*(*r++);
1175     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1176     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1177     while (nz--) {
1178       idx   = 7*(*vi++);
1179       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1180       x4    = t[3+idx];x5 = t[4+idx];
1181       x6    = t[5+idx];x7 = t[6+idx];
1182       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1183       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1184       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1185       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1186       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1187       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1188       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1189       v += 49;
1190     }
1191     idx = 7*i;
1192     t[idx]   = s1;t[1+idx] = s2;
1193     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1194     t[5+idx] = s6;t[6+idx] = s7;
1195   }
1196   /* backward solve the upper triangular */
1197   for (i=n-1; i>=0; i--){
1198     v    = aa + 49*diag[i] + 49;
1199     vi   = aj + diag[i] + 1;
1200     nz   = ai[i+1] - diag[i] - 1;
1201     idt  = 7*i;
1202     s1 = t[idt];  s2 = t[1+idt];
1203     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1204     s6 = t[5+idt];s7 = t[6+idt];
1205     while (nz--) {
1206       idx   = 7*(*vi++);
1207       x1    = t[idx];   x2 = t[1+idx];
1208       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1209       x6    = t[5+idx]; x7 = t[6+idx];
1210       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1211       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1212       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1213       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1214       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1215       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1216       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1217       v += 49;
1218     }
1219     idc = 7*(*c--);
1220     v   = aa + 49*diag[i];
1221     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1222                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1223     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1224                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1225     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1226                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1227     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1228                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1229     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1230                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1231     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1232                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1233     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1234                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1235   }
1236 
1237   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1238   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1239   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1240   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1241   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 #undef __FUNCT__
1246 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1247 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1248 {
1249   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1250   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1251   int             ierr,*diag = a->diag,jdx;
1252   MatScalar       *aa=a->a,*v;
1253   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1254 
1255   PetscFunctionBegin;
1256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1258   /* forward solve the lower triangular */
1259   idx    = 0;
1260   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1261   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1262   x[6] = b[6+idx];
1263   for (i=1; i<n; i++) {
1264     v     =  aa + 49*ai[i];
1265     vi    =  aj + ai[i];
1266     nz    =  diag[i] - ai[i];
1267     idx   =  7*i;
1268     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1269     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1270     s7  =  b[6+idx];
1271     while (nz--) {
1272       jdx   = 7*(*vi++);
1273       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1274       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1275       x7    = x[6+jdx];
1276       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1277       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1278       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1279       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1280       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1281       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1282       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1283       v += 49;
1284      }
1285     x[idx]   = s1;
1286     x[1+idx] = s2;
1287     x[2+idx] = s3;
1288     x[3+idx] = s4;
1289     x[4+idx] = s5;
1290     x[5+idx] = s6;
1291     x[6+idx] = s7;
1292   }
1293   /* backward solve the upper triangular */
1294   for (i=n-1; i>=0; i--){
1295     v    = aa + 49*diag[i] + 49;
1296     vi   = aj + diag[i] + 1;
1297     nz   = ai[i+1] - diag[i] - 1;
1298     idt  = 7*i;
1299     s1 = x[idt];   s2 = x[1+idt];
1300     s3 = x[2+idt]; s4 = x[3+idt];
1301     s5 = x[4+idt]; s6 = x[5+idt];
1302     s7 = x[6+idt];
1303     while (nz--) {
1304       idx   = 7*(*vi++);
1305       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1306       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1307       x7    = x[6+idx];
1308       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1309       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1310       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1311       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1312       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1313       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1314       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1315       v += 49;
1316     }
1317     v        = aa + 49*diag[i];
1318     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1319                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1320     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1321                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1322     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1323                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1324     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1325                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1326     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1327                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1328     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1329                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1330     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1331                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1332   }
1333 
1334   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1336   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 #undef __FUNCT__
1341 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1342 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1343 {
1344   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1345   IS              iscol=a->col,isrow=a->row;
1346   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1347   int             *diag = a->diag;
1348   MatScalar       *aa=a->a,*v;
1349   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1350 
1351   PetscFunctionBegin;
1352   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1353   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354   t  = a->solve_work;
1355 
1356   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1357   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1358 
1359   /* forward solve the lower triangular */
1360   idx    = 6*(*r++);
1361   t[0] = b[idx];   t[1] = b[1+idx];
1362   t[2] = b[2+idx]; t[3] = b[3+idx];
1363   t[4] = b[4+idx]; t[5] = b[5+idx];
1364   for (i=1; i<n; i++) {
1365     v     = aa + 36*ai[i];
1366     vi    = aj + ai[i];
1367     nz    = diag[i] - ai[i];
1368     idx   = 6*(*r++);
1369     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1370     s5  = b[4+idx]; s6 = b[5+idx];
1371     while (nz--) {
1372       idx   = 6*(*vi++);
1373       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1374       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1375       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1376       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1377       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1378       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1379       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1380       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1381       v += 36;
1382     }
1383     idx = 6*i;
1384     t[idx]   = s1;t[1+idx] = s2;
1385     t[2+idx] = s3;t[3+idx] = s4;
1386     t[4+idx] = s5;t[5+idx] = s6;
1387   }
1388   /* backward solve the upper triangular */
1389   for (i=n-1; i>=0; i--){
1390     v    = aa + 36*diag[i] + 36;
1391     vi   = aj + diag[i] + 1;
1392     nz   = ai[i+1] - diag[i] - 1;
1393     idt  = 6*i;
1394     s1 = t[idt];  s2 = t[1+idt];
1395     s3 = t[2+idt];s4 = t[3+idt];
1396     s5 = t[4+idt];s6 = t[5+idt];
1397     while (nz--) {
1398       idx   = 6*(*vi++);
1399       x1    = t[idx];   x2 = t[1+idx];
1400       x3    = t[2+idx]; x4 = t[3+idx];
1401       x5    = t[4+idx]; x6 = t[5+idx];
1402       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1403       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1404       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1405       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1406       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1407       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1408       v += 36;
1409     }
1410     idc = 6*(*c--);
1411     v   = aa + 36*diag[i];
1412     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1413                                  v[18]*s4+v[24]*s5+v[30]*s6;
1414     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1415                                  v[19]*s4+v[25]*s5+v[31]*s6;
1416     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1417                                  v[20]*s4+v[26]*s5+v[32]*s6;
1418     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1419                                  v[21]*s4+v[27]*s5+v[33]*s6;
1420     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1421                                  v[22]*s4+v[28]*s5+v[34]*s6;
1422     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1423                                  v[23]*s4+v[29]*s5+v[35]*s6;
1424   }
1425 
1426   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1427   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1428   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1429   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1430   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 #undef __FUNCT__
1435 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1436 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1437 {
1438   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1439   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1440   int             ierr,*diag = a->diag,jdx;
1441   MatScalar       *aa=a->a,*v;
1442   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1443 
1444   PetscFunctionBegin;
1445   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1446   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1447   /* forward solve the lower triangular */
1448   idx    = 0;
1449   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1450   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1451   for (i=1; i<n; i++) {
1452     v     =  aa + 36*ai[i];
1453     vi    =  aj + ai[i];
1454     nz    =  diag[i] - ai[i];
1455     idx   =  6*i;
1456     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1457     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1458     while (nz--) {
1459       jdx   = 6*(*vi++);
1460       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1461       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1462       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1463       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1464       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1465       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1466       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1467       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1468       v += 36;
1469      }
1470     x[idx]   = s1;
1471     x[1+idx] = s2;
1472     x[2+idx] = s3;
1473     x[3+idx] = s4;
1474     x[4+idx] = s5;
1475     x[5+idx] = s6;
1476   }
1477   /* backward solve the upper triangular */
1478   for (i=n-1; i>=0; i--){
1479     v    = aa + 36*diag[i] + 36;
1480     vi   = aj + diag[i] + 1;
1481     nz   = ai[i+1] - diag[i] - 1;
1482     idt  = 6*i;
1483     s1 = x[idt];   s2 = x[1+idt];
1484     s3 = x[2+idt]; s4 = x[3+idt];
1485     s5 = x[4+idt]; s6 = x[5+idt];
1486     while (nz--) {
1487       idx   = 6*(*vi++);
1488       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1489       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1490       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496       v += 36;
1497     }
1498     v        = aa + 36*diag[i];
1499     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1500     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1501     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1502     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1503     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1504     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1505   }
1506 
1507   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1508   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1509   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1510   PetscFunctionReturn(0);
1511 }
1512 
1513 #undef __FUNCT__
1514 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1515 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1516 {
1517   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1518   IS              iscol=a->col,isrow=a->row;
1519   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1520   int             *diag = a->diag;
1521   MatScalar       *aa=a->a,*v;
1522   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1523 
1524   PetscFunctionBegin;
1525   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1526   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1527   t  = a->solve_work;
1528 
1529   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1530   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1531 
1532   /* forward solve the lower triangular */
1533   idx    = 5*(*r++);
1534   t[0] = b[idx];   t[1] = b[1+idx];
1535   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1536   for (i=1; i<n; i++) {
1537     v     = aa + 25*ai[i];
1538     vi    = aj + ai[i];
1539     nz    = diag[i] - ai[i];
1540     idx   = 5*(*r++);
1541     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1542     s5  = b[4+idx];
1543     while (nz--) {
1544       idx   = 5*(*vi++);
1545       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1546       x4    = t[3+idx];x5 = t[4+idx];
1547       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1548       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1549       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1550       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1551       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1552       v += 25;
1553     }
1554     idx = 5*i;
1555     t[idx]   = s1;t[1+idx] = s2;
1556     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1557   }
1558   /* backward solve the upper triangular */
1559   for (i=n-1; i>=0; i--){
1560     v    = aa + 25*diag[i] + 25;
1561     vi   = aj + diag[i] + 1;
1562     nz   = ai[i+1] - diag[i] - 1;
1563     idt  = 5*i;
1564     s1 = t[idt];  s2 = t[1+idt];
1565     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1566     while (nz--) {
1567       idx   = 5*(*vi++);
1568       x1    = t[idx];   x2 = t[1+idx];
1569       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1570       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1571       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1572       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1573       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1574       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1575       v += 25;
1576     }
1577     idc = 5*(*c--);
1578     v   = aa + 25*diag[i];
1579     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1580                                  v[15]*s4+v[20]*s5;
1581     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1582                                  v[16]*s4+v[21]*s5;
1583     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1584                                  v[17]*s4+v[22]*s5;
1585     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1586                                  v[18]*s4+v[23]*s5;
1587     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1588                                  v[19]*s4+v[24]*s5;
1589   }
1590 
1591   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1592   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1595   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 #undef __FUNCT__
1600 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1601 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1602 {
1603   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1604   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1605   int             ierr,*diag = a->diag,jdx;
1606   MatScalar       *aa=a->a,*v;
1607   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1608 
1609   PetscFunctionBegin;
1610   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1611   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1612   /* forward solve the lower triangular */
1613   idx    = 0;
1614   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1615   for (i=1; i<n; i++) {
1616     v     =  aa + 25*ai[i];
1617     vi    =  aj + ai[i];
1618     nz    =  diag[i] - ai[i];
1619     idx   =  5*i;
1620     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1621     while (nz--) {
1622       jdx   = 5*(*vi++);
1623       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1624       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1625       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1626       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1627       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1628       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1629       v    += 25;
1630     }
1631     x[idx]   = s1;
1632     x[1+idx] = s2;
1633     x[2+idx] = s3;
1634     x[3+idx] = s4;
1635     x[4+idx] = s5;
1636   }
1637   /* backward solve the upper triangular */
1638   for (i=n-1; i>=0; i--){
1639     v    = aa + 25*diag[i] + 25;
1640     vi   = aj + diag[i] + 1;
1641     nz   = ai[i+1] - diag[i] - 1;
1642     idt  = 5*i;
1643     s1 = x[idt];  s2 = x[1+idt];
1644     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1645     while (nz--) {
1646       idx   = 5*(*vi++);
1647       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1648       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1649       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1650       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1651       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1652       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1653       v    += 25;
1654     }
1655     v        = aa + 25*diag[i];
1656     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1657     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1658     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1659     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1660     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1661   }
1662 
1663   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1664   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1665   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1666   PetscFunctionReturn(0);
1667 }
1668 
1669 #undef __FUNCT__
1670 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1671 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1672 {
1673   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1674   IS              iscol=a->col,isrow=a->row;
1675   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1676   int             *diag = a->diag;
1677   MatScalar       *aa=a->a,*v;
1678   Scalar          *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1679 
1680   PetscFunctionBegin;
1681   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1682   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1683   t  = a->solve_work;
1684 
1685   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1686   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1687 
1688   /* forward solve the lower triangular */
1689   idx    = 4*(*r++);
1690   t[0] = b[idx];   t[1] = b[1+idx];
1691   t[2] = b[2+idx]; t[3] = b[3+idx];
1692   for (i=1; i<n; i++) {
1693     v     = aa + 16*ai[i];
1694     vi    = aj + ai[i];
1695     nz    = diag[i] - ai[i];
1696     idx   = 4*(*r++);
1697     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1698     while (nz--) {
1699       idx   = 4*(*vi++);
1700       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1701       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1702       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1703       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1704       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1705       v    += 16;
1706     }
1707     idx        = 4*i;
1708     t[idx]   = s1;t[1+idx] = s2;
1709     t[2+idx] = s3;t[3+idx] = s4;
1710   }
1711   /* backward solve the upper triangular */
1712   for (i=n-1; i>=0; i--){
1713     v    = aa + 16*diag[i] + 16;
1714     vi   = aj + diag[i] + 1;
1715     nz   = ai[i+1] - diag[i] - 1;
1716     idt  = 4*i;
1717     s1 = t[idt];  s2 = t[1+idt];
1718     s3 = t[2+idt];s4 = t[3+idt];
1719     while (nz--) {
1720       idx   = 4*(*vi++);
1721       x1    = t[idx];   x2 = t[1+idx];
1722       x3    = t[2+idx]; x4 = t[3+idx];
1723       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1724       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1725       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1726       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1727       v += 16;
1728     }
1729     idc      = 4*(*c--);
1730     v        = aa + 16*diag[i];
1731     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1732     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1733     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1734     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1735   }
1736 
1737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 
1746 /*
1747       Special case where the matrix was ILU(0) factored in the natural
1748    ordering. This eliminates the need for the column and row permutation.
1749 */
1750 #undef __FUNCT__
1751 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
1752 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1753 {
1754   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1755   int             n=a->mbs,*ai=a->i,*aj=a->j;
1756   int             ierr,*diag = a->diag;
1757   MatScalar       *aa=a->a;
1758   Scalar          *x,*b;
1759 
1760   PetscFunctionBegin;
1761   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763 
1764 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1765   {
1766     static Scalar w[2000]; /* very BAD need to fix */
1767     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1768   }
1769 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1770   {
1771     static Scalar w[2000]; /* very BAD need to fix */
1772     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1773   }
1774 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1775   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1776 #else
1777   {
1778     Scalar    s1,s2,s3,s4,x1,x2,x3,x4;
1779     MatScalar *v;
1780     int       jdx,idt,idx,nz,*vi,i,ai16;
1781 
1782   /* forward solve the lower triangular */
1783   idx    = 0;
1784   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1785   for (i=1; i<n; i++) {
1786     v     =  aa      + 16*ai[i];
1787     vi    =  aj      + ai[i];
1788     nz    =  diag[i] - ai[i];
1789     idx   +=  4;
1790     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1791     while (nz--) {
1792       jdx   = 4*(*vi++);
1793       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1794       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1795       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1796       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1797       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1798       v    += 16;
1799     }
1800     x[idx]   = s1;
1801     x[1+idx] = s2;
1802     x[2+idx] = s3;
1803     x[3+idx] = s4;
1804   }
1805   /* backward solve the upper triangular */
1806   idt = 4*(n-1);
1807   for (i=n-1; i>=0; i--){
1808     ai16 = 16*diag[i];
1809     v    = aa + ai16 + 16;
1810     vi   = aj + diag[i] + 1;
1811     nz   = ai[i+1] - diag[i] - 1;
1812     s1 = x[idt];  s2 = x[1+idt];
1813     s3 = x[2+idt];s4 = x[3+idt];
1814     while (nz--) {
1815       idx   = 4*(*vi++);
1816       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1817       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1818       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1819       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1820       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1821       v    += 16;
1822     }
1823     v        = aa + ai16;
1824     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
1825     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
1826     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1827     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1828     idt -= 4;
1829   }
1830   }
1831 #endif
1832 
1833   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1834   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1835   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1836   PetscFunctionReturn(0);
1837 }
1838 
1839 #if defined (PETSC_HAVE_SSE)
1840 
1841 #include PETSC_HAVE_SSE
1842 
1843 #undef __FUNCT__
1844 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
1845 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
1846 {
1847   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1848   int             n=a->mbs,*ai=a->i,*aj=a->j;
1849   int             ierr,*diag = a->diag;
1850   MatScalar       *aa=a->a;
1851   Scalar          *x,*b;
1852 
1853   PetscFunctionBegin;
1854   SSE_SCOPE_BEGIN;
1855   /*
1856      Note: This code currently uses demotion of double
1857      to float when performing the mixed-mode computation.
1858      This may not be numerically reasonable for all applications.
1859   */
1860   PREFETCH_NTA(aa+16*ai[1]);
1861 
1862   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1863   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1864   {
1865     MatScalar     *v;
1866     int           jdx,idt,idx,nz,*vi,i,ai16;
1867 
1868     /* Make space in temp stack for 16 Byte Aligned arrays */
1869     float         ssealignedspace[11],*tmps,*tmpx;
1870     unsigned long offset = (unsigned long)ssealignedspace % 16;
1871     if (offset) offset = (16 - offset)/4;
1872     tmps = &ssealignedspace[offset];
1873     tmpx = &ssealignedspace[offset+4];
1874 
1875   /* forward solve the lower triangular */
1876     idx  = 0;
1877     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1878     v    =  aa + 16*ai[1];
1879 
1880     for (i=1; i<n;) {
1881       PREFETCH_NTA(&v[8]);
1882       vi   =  aj      + ai[i];
1883       nz   =  diag[i] - ai[i];
1884       idx +=  4;
1885 
1886     /* Demote sum from double to float */
1887       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1888       LOAD_PS(tmps,XMM7);
1889 
1890       while (nz--) {
1891         PREFETCH_NTA(&v[16]);
1892         jdx = 4*(*vi++);
1893 
1894         /* Demote solution (so far) from double to float */
1895         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);
1896 
1897         /* 4x4 Matrix-Vector product with negative accumulation: */
1898         SSE_INLINE_BEGIN_2(tmpx,v)
1899           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1900 
1901           /* First Column */
1902           SSE_COPY_PS(XMM0,XMM6)
1903           SSE_SHUFFLE(XMM0,XMM0,0x00)
1904           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1905           SSE_SUB_PS(XMM7,XMM0)
1906 
1907           /* Second Column */
1908           SSE_COPY_PS(XMM1,XMM6)
1909           SSE_SHUFFLE(XMM1,XMM1,0x55)
1910           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1911           SSE_SUB_PS(XMM7,XMM1)
1912 
1913           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1914 
1915           /* Third Column */
1916           SSE_COPY_PS(XMM2,XMM6)
1917           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1918           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1919           SSE_SUB_PS(XMM7,XMM2)
1920 
1921           /* Fourth Column */
1922           SSE_COPY_PS(XMM3,XMM6)
1923           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1924           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1925           SSE_SUB_PS(XMM7,XMM3)
1926         SSE_INLINE_END_2
1927 
1928         v  += 16;
1929       }
1930       v    =  aa + 16*ai[++i];
1931       PREFETCH_NTA(v);
1932       STORE_PS(tmps,XMM7);
1933 
1934       /* Promote result from float to double */
1935       CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
1936     }
1937     /* backward solve the upper triangular */
1938     idt  = 4*(n-1);
1939     ai16 = 16*diag[n-1];
1940     v    = aa + ai16 + 16;
1941     for (i=n-1; i>=0;){
1942       PREFETCH_NTA(&v[8]);
1943       vi = aj + diag[i] + 1;
1944       nz = ai[i+1] - diag[i] - 1;
1945 
1946       /* Demote accumulator from double to float */
1947       CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
1948       LOAD_PS(tmps,XMM7);
1949 
1950       while (nz--) {
1951         PREFETCH_NTA(&v[16]);
1952         idx = 4*(*vi++);
1953 
1954         /* Demote solution (so far) from double to float */
1955         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1956 
1957         /* 4x4 Matrix-Vector Product with negative accumulation: */
1958         SSE_INLINE_BEGIN_2(tmpx,v)
1959           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1960 
1961           /* First Column */
1962           SSE_COPY_PS(XMM0,XMM6)
1963           SSE_SHUFFLE(XMM0,XMM0,0x00)
1964           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1965           SSE_SUB_PS(XMM7,XMM0)
1966 
1967           /* Second Column */
1968           SSE_COPY_PS(XMM1,XMM6)
1969           SSE_SHUFFLE(XMM1,XMM1,0x55)
1970           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1971           SSE_SUB_PS(XMM7,XMM1)
1972 
1973           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1974 
1975           /* Third Column */
1976           SSE_COPY_PS(XMM2,XMM6)
1977           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1978           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1979           SSE_SUB_PS(XMM7,XMM2)
1980 
1981           /* Fourth Column */
1982           SSE_COPY_PS(XMM3,XMM6)
1983           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1984           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1985           SSE_SUB_PS(XMM7,XMM3)
1986         SSE_INLINE_END_2
1987         v  += 16;
1988       }
1989       v    = aa + ai16;
1990       ai16 = 16*diag[--i];
1991       PREFETCH_NTA(aa+ai16+16);
1992       /*
1993          Scale the result by the diagonal 4x4 block,
1994          which was inverted as part of the factorization
1995       */
1996       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1997         /* First Column */
1998         SSE_COPY_PS(XMM0,XMM7)
1999         SSE_SHUFFLE(XMM0,XMM0,0x00)
2000         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2001 
2002         /* Second Column */
2003         SSE_COPY_PS(XMM1,XMM7)
2004         SSE_SHUFFLE(XMM1,XMM1,0x55)
2005         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2006         SSE_ADD_PS(XMM0,XMM1)
2007 
2008         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2009 
2010         /* Third Column */
2011         SSE_COPY_PS(XMM2,XMM7)
2012         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2013         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2014         SSE_ADD_PS(XMM0,XMM2)
2015 
2016         /* Fourth Column */
2017         SSE_COPY_PS(XMM3,XMM7)
2018         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2019         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2020         SSE_ADD_PS(XMM0,XMM3)
2021 
2022         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2023       SSE_INLINE_END_3
2024 
2025       /* Promote solution from float to double */
2026       CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);
2027 
2028       v    = aa + ai16 + 16;
2029       idt -= 4;
2030     }
2031   }
2032   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2033   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2034   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2035   SSE_SCOPE_END;
2036   PetscFunctionReturn(0);
2037 }
2038 
2039 #endif
2040 #undef __FUNCT__
2041 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2042 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2043 {
2044   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2045   IS              iscol=a->col,isrow=a->row;
2046   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2047   int             *diag = a->diag;
2048   MatScalar       *aa=a->a,*v;
2049   Scalar          *x,*b,s1,s2,s3,x1,x2,x3,*t;
2050 
2051   PetscFunctionBegin;
2052   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2053   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2054   t  = a->solve_work;
2055 
2056   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2057   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2058 
2059   /* forward solve the lower triangular */
2060   idx    = 3*(*r++);
2061   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2062   for (i=1; i<n; i++) {
2063     v     = aa + 9*ai[i];
2064     vi    = aj + ai[i];
2065     nz    = diag[i] - ai[i];
2066     idx   = 3*(*r++);
2067     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2068     while (nz--) {
2069       idx   = 3*(*vi++);
2070       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2071       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2072       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2073       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2074       v += 9;
2075     }
2076     idx = 3*i;
2077     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2078   }
2079   /* backward solve the upper triangular */
2080   for (i=n-1; i>=0; i--){
2081     v    = aa + 9*diag[i] + 9;
2082     vi   = aj + diag[i] + 1;
2083     nz   = ai[i+1] - diag[i] - 1;
2084     idt  = 3*i;
2085     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2086     while (nz--) {
2087       idx   = 3*(*vi++);
2088       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2089       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2090       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2091       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2092       v += 9;
2093     }
2094     idc = 3*(*c--);
2095     v   = aa + 9*diag[i];
2096     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2097     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2098     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2099   }
2100   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2101   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2102   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2103   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2104   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2105   PetscFunctionReturn(0);
2106 }
2107 
2108 /*
2109       Special case where the matrix was ILU(0) factored in the natural
2110    ordering. This eliminates the need for the column and row permutation.
2111 */
2112 #undef __FUNCT__
2113 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2114 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2115 {
2116   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2117   int             n=a->mbs,*ai=a->i,*aj=a->j;
2118   int             ierr,*diag = a->diag;
2119   MatScalar       *aa=a->a,*v;
2120   Scalar          *x,*b,s1,s2,s3,x1,x2,x3;
2121   int             jdx,idt,idx,nz,*vi,i;
2122 
2123   PetscFunctionBegin;
2124   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2125   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2126 
2127 
2128   /* forward solve the lower triangular */
2129   idx    = 0;
2130   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2131   for (i=1; i<n; i++) {
2132     v     =  aa      + 9*ai[i];
2133     vi    =  aj      + ai[i];
2134     nz    =  diag[i] - ai[i];
2135     idx   +=  3;
2136     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2137     while (nz--) {
2138       jdx   = 3*(*vi++);
2139       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2140       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2141       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2142       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2143       v    += 9;
2144     }
2145     x[idx]   = s1;
2146     x[1+idx] = s2;
2147     x[2+idx] = s3;
2148   }
2149   /* backward solve the upper triangular */
2150   for (i=n-1; i>=0; i--){
2151     v    = aa + 9*diag[i] + 9;
2152     vi   = aj + diag[i] + 1;
2153     nz   = ai[i+1] - diag[i] - 1;
2154     idt  = 3*i;
2155     s1 = x[idt];  s2 = x[1+idt];
2156     s3 = x[2+idt];
2157     while (nz--) {
2158       idx   = 3*(*vi++);
2159       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2160       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2161       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2162       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2163       v    += 9;
2164     }
2165     v        = aa +  9*diag[i];
2166     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2167     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2168     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2169   }
2170 
2171   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2172   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2173   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2174   PetscFunctionReturn(0);
2175 }
2176 
2177 #undef __FUNCT__
2178 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2179 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2180 {
2181   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2182   IS              iscol=a->col,isrow=a->row;
2183   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2184   int             *diag = a->diag;
2185   MatScalar       *aa=a->a,*v;
2186   Scalar          *x,*b,s1,s2,x1,x2,*t;
2187 
2188   PetscFunctionBegin;
2189   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2190   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2191   t  = a->solve_work;
2192 
2193   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2194   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2195 
2196   /* forward solve the lower triangular */
2197   idx    = 2*(*r++);
2198   t[0] = b[idx]; t[1] = b[1+idx];
2199   for (i=1; i<n; i++) {
2200     v     = aa + 4*ai[i];
2201     vi    = aj + ai[i];
2202     nz    = diag[i] - ai[i];
2203     idx   = 2*(*r++);
2204     s1  = b[idx]; s2 = b[1+idx];
2205     while (nz--) {
2206       idx   = 2*(*vi++);
2207       x1    = t[idx]; x2 = t[1+idx];
2208       s1 -= v[0]*x1 + v[2]*x2;
2209       s2 -= v[1]*x1 + v[3]*x2;
2210       v += 4;
2211     }
2212     idx = 2*i;
2213     t[idx] = s1; t[1+idx] = s2;
2214   }
2215   /* backward solve the upper triangular */
2216   for (i=n-1; i>=0; i--){
2217     v    = aa + 4*diag[i] + 4;
2218     vi   = aj + diag[i] + 1;
2219     nz   = ai[i+1] - diag[i] - 1;
2220     idt  = 2*i;
2221     s1 = t[idt]; s2 = t[1+idt];
2222     while (nz--) {
2223       idx   = 2*(*vi++);
2224       x1    = t[idx]; x2 = t[1+idx];
2225       s1 -= v[0]*x1 + v[2]*x2;
2226       s2 -= v[1]*x1 + v[3]*x2;
2227       v += 4;
2228     }
2229     idc = 2*(*c--);
2230     v   = aa + 4*diag[i];
2231     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2232     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2233   }
2234   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2235   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2236   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2238   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2239   PetscFunctionReturn(0);
2240 }
2241 
2242 /*
2243       Special case where the matrix was ILU(0) factored in the natural
2244    ordering. This eliminates the need for the column and row permutation.
2245 */
2246 #undef __FUNCT__
2247 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2248 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2249 {
2250   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2251   int             n=a->mbs,*ai=a->i,*aj=a->j;
2252   int             ierr,*diag = a->diag;
2253   MatScalar       *aa=a->a,*v;
2254   Scalar          *x,*b,s1,s2,x1,x2;
2255   int             jdx,idt,idx,nz,*vi,i;
2256 
2257   PetscFunctionBegin;
2258   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2260 
2261   /* forward solve the lower triangular */
2262   idx    = 0;
2263   x[0]   = b[0]; x[1] = b[1];
2264   for (i=1; i<n; i++) {
2265     v     =  aa      + 4*ai[i];
2266     vi    =  aj      + ai[i];
2267     nz    =  diag[i] - ai[i];
2268     idx   +=  2;
2269     s1  =  b[idx];s2 = b[1+idx];
2270     while (nz--) {
2271       jdx   = 2*(*vi++);
2272       x1    = x[jdx];x2 = x[1+jdx];
2273       s1 -= v[0]*x1 + v[2]*x2;
2274       s2 -= v[1]*x1 + v[3]*x2;
2275       v    += 4;
2276     }
2277     x[idx]   = s1;
2278     x[1+idx] = s2;
2279   }
2280   /* backward solve the upper triangular */
2281   for (i=n-1; i>=0; i--){
2282     v    = aa + 4*diag[i] + 4;
2283     vi   = aj + diag[i] + 1;
2284     nz   = ai[i+1] - diag[i] - 1;
2285     idt  = 2*i;
2286     s1 = x[idt];  s2 = x[1+idt];
2287     while (nz--) {
2288       idx   = 2*(*vi++);
2289       x1    = x[idx];   x2 = x[1+idx];
2290       s1 -= v[0]*x1 + v[2]*x2;
2291       s2 -= v[1]*x1 + v[3]*x2;
2292       v    += 4;
2293     }
2294     v        = aa +  4*diag[i];
2295     x[idt]   = v[0]*s1 + v[2]*s2;
2296     x[1+idt] = v[1]*s1 + v[3]*s2;
2297   }
2298 
2299   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2300   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2301   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2302   PetscFunctionReturn(0);
2303 }
2304 
2305 #undef __FUNCT__
2306 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2307 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2308 {
2309   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2310   IS              iscol=a->col,isrow=a->row;
2311   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2312   int             *diag = a->diag;
2313   MatScalar       *aa=a->a,*v;
2314   Scalar          *x,*b,s1,*t;
2315 
2316   PetscFunctionBegin;
2317   if (!n) PetscFunctionReturn(0);
2318 
2319   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2320   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2321   t  = a->solve_work;
2322 
2323   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2324   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2325 
2326   /* forward solve the lower triangular */
2327   t[0] = b[*r++];
2328   for (i=1; i<n; i++) {
2329     v     = aa + ai[i];
2330     vi    = aj + ai[i];
2331     nz    = diag[i] - ai[i];
2332     s1  = b[*r++];
2333     while (nz--) {
2334       s1 -= (*v++)*t[*vi++];
2335     }
2336     t[i] = s1;
2337   }
2338   /* backward solve the upper triangular */
2339   for (i=n-1; i>=0; i--){
2340     v    = aa + diag[i] + 1;
2341     vi   = aj + diag[i] + 1;
2342     nz   = ai[i+1] - diag[i] - 1;
2343     s1 = t[i];
2344     while (nz--) {
2345       s1 -= (*v++)*t[*vi++];
2346     }
2347     x[*c--] = t[i] = aa[diag[i]]*s1;
2348   }
2349 
2350   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2351   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2352   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2353   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2354   PetscLogFlops(2*1*(a->nz) - A->n);
2355   PetscFunctionReturn(0);
2356 }
2357 /*
2358       Special case where the matrix was ILU(0) factored in the natural
2359    ordering. This eliminates the need for the column and row permutation.
2360 */
2361 #undef __FUNCT__
2362 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
2363 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2364 {
2365   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2366   int             n=a->mbs,*ai=a->i,*aj=a->j;
2367   int             ierr,*diag = a->diag;
2368   MatScalar       *aa=a->a;
2369   Scalar          *x,*b;
2370   Scalar          s1,x1;
2371   MatScalar       *v;
2372   int             jdx,idt,idx,nz,*vi,i;
2373 
2374   PetscFunctionBegin;
2375   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2376   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2377 
2378   /* forward solve the lower triangular */
2379   idx    = 0;
2380   x[0]   = b[0];
2381   for (i=1; i<n; i++) {
2382     v     =  aa      + ai[i];
2383     vi    =  aj      + ai[i];
2384     nz    =  diag[i] - ai[i];
2385     idx   +=  1;
2386     s1  =  b[idx];
2387     while (nz--) {
2388       jdx   = *vi++;
2389       x1    = x[jdx];
2390       s1 -= v[0]*x1;
2391       v    += 1;
2392     }
2393     x[idx]   = s1;
2394   }
2395   /* backward solve the upper triangular */
2396   for (i=n-1; i>=0; i--){
2397     v    = aa + diag[i] + 1;
2398     vi   = aj + diag[i] + 1;
2399     nz   = ai[i+1] - diag[i] - 1;
2400     idt  = i;
2401     s1 = x[idt];
2402     while (nz--) {
2403       idx   = *vi++;
2404       x1    = x[idx];
2405       s1 -= v[0]*x1;
2406       v    += 1;
2407     }
2408     v        = aa +  diag[i];
2409     x[idt]   = v[0]*s1;
2410   }
2411   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2412   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2413   PetscLogFlops(2*(a->nz) - A->n);
2414   PetscFunctionReturn(0);
2415 }
2416 
2417 /* ----------------------------------------------------------------*/
2418 /*
2419      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2420    except that the data structure of Mat_SeqAIJ is slightly different.
2421    Not a good example of code reuse.
2422 */
2423 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
2424 
2425 #undef __FUNCT__
2426 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
2427 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2428 {
2429   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2430   IS          isicol;
2431   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2432   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2433   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2434   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2435   PetscTruth  col_identity,row_identity;
2436   PetscReal   f;
2437 
2438   PetscFunctionBegin;
2439   if (info) {
2440     f             = info->fill;
2441     levels        = (int)info->levels;
2442     diagonal_fill = (int)info->diagonal_fill;
2443   } else {
2444     f             = 1.0;
2445     levels        = 0;
2446     diagonal_fill = 0;
2447   }
2448   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2449   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2450   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2451 
2452   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2453     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2454     (*fact)->factor = FACTOR_LU;
2455     b               = (Mat_SeqBAIJ*)(*fact)->data;
2456     if (!b->diag) {
2457       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2458     }
2459     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2460     b->row        = isrow;
2461     b->col        = iscol;
2462     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2463     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2464     b->icol       = isicol;
2465     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(Scalar),&b->solve_work);CHKERRQ(ierr);
2466   } else { /* general case perform the symbolic factorization */
2467     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
2468     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
2469 
2470     /* get new row pointers */
2471     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
2472     ainew[0] = 0;
2473     /* don't know how many column pointers are needed so estimate */
2474     jmax = (int)(f*ai[n] + 1);
2475     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
2476     /* ajfill is level of fill for each fill entry */
2477     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
2478     /* fill is a linked list of nonzeros in active row */
2479     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
2480     /* im is level for each filled value */
2481     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
2482     /* dloc is location of diagonal in factor */
2483     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
2484     dloc[0]  = 0;
2485     for (prow=0; prow<n; prow++) {
2486 
2487       /* copy prow into linked list */
2488       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2489       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
2490       xi         = aj + ai[r[prow]];
2491       fill[n]    = n;
2492       fill[prow] = -1; /* marker for diagonal entry */
2493       while (nz--) {
2494 	fm  = n;
2495 	idx = ic[*xi++];
2496 	do {
2497 	  m  = fm;
2498 	  fm = fill[m];
2499 	} while (fm < idx);
2500 	fill[m]   = idx;
2501 	fill[idx] = fm;
2502 	im[idx]   = 0;
2503       }
2504 
2505       /* make sure diagonal entry is included */
2506       if (diagonal_fill && fill[prow] == -1) {
2507 	fm = n;
2508 	while (fill[fm] < prow) fm = fill[fm];
2509 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
2510 	fill[fm]   = prow;
2511 	im[prow]   = 0;
2512 	nzf++;
2513 	dcount++;
2514       }
2515 
2516       nzi = 0;
2517       row = fill[n];
2518       while (row < prow) {
2519 	incrlev = im[row] + 1;
2520 	nz      = dloc[row];
2521 	xi      = ajnew  + ainew[row] + nz + 1;
2522 	flev    = ajfill + ainew[row] + nz + 1;
2523 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
2524 	fm      = row;
2525 	while (nnz-- > 0) {
2526 	  idx = *xi++;
2527 	  if (*flev + incrlev > levels) {
2528 	    flev++;
2529 	    continue;
2530 	  }
2531 	  do {
2532 	    m  = fm;
2533 	    fm = fill[m];
2534 	  } while (fm < idx);
2535 	  if (fm != idx) {
2536 	    im[idx]   = *flev + incrlev;
2537 	    fill[m]   = idx;
2538 	    fill[idx] = fm;
2539 	    fm        = idx;
2540 	    nzf++;
2541 	  } else {
2542 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2543 	  }
2544 	  flev++;
2545 	}
2546 	row = fill[row];
2547 	nzi++;
2548       }
2549       /* copy new filled row into permanent storage */
2550       ainew[prow+1] = ainew[prow] + nzf;
2551       if (ainew[prow+1] > jmax) {
2552 
2553 	/* estimate how much additional space we will need */
2554 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2555 	/* just double the memory each time */
2556 	int maxadd = jmax;
2557 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2558 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2559 	jmax += maxadd;
2560 
2561 	/* allocate a longer ajnew and ajfill */
2562 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2563 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2564 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
2565 	ajnew = xi;
2566 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2567 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2568 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
2569 	ajfill = xi;
2570 	realloc++; /* count how many reallocations are needed */
2571       }
2572       xi          = ajnew + ainew[prow];
2573       flev        = ajfill + ainew[prow];
2574       dloc[prow]  = nzi;
2575       fm          = fill[n];
2576       while (nzf--) {
2577 	*xi++   = fm;
2578 	*flev++ = im[fm];
2579 	fm      = fill[fm];
2580       }
2581       /* make sure row has diagonal entry */
2582       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2583 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
2584     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2585       }
2586     }
2587     ierr = PetscFree(ajfill);CHKERRQ(ierr);
2588     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2589     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2590     ierr = PetscFree(fill);CHKERRQ(ierr);
2591     ierr = PetscFree(im);CHKERRQ(ierr);
2592 
2593     {
2594       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2595       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
2596       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
2597       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
2598       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
2599       if (diagonal_fill) {
2600 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
2601       }
2602     }
2603 
2604     /* put together the new matrix */
2605     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
2606     PetscLogObjectParent(*fact,isicol);
2607     b = (Mat_SeqBAIJ*)(*fact)->data;
2608     ierr = PetscFree(b->imax);CHKERRQ(ierr);
2609     b->singlemalloc = PETSC_FALSE;
2610     len = bs2*ainew[n]*sizeof(MatScalar);
2611     /* the next line frees the default space generated by the Create() */
2612     ierr = PetscFree(b->a);CHKERRQ(ierr);
2613     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
2614     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
2615     b->j          = ajnew;
2616     b->i          = ainew;
2617     for (i=0; i<n; i++) dloc[i] += ainew[i];
2618     b->diag       = dloc;
2619     b->ilen       = 0;
2620     b->imax       = 0;
2621     b->row        = isrow;
2622     b->col        = iscol;
2623     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2624     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2625     b->icol       = isicol;
2626     ierr = PetscMalloc((bs*n+bs)*sizeof(Scalar),&b->solve_work);CHKERRQ(ierr);
2627     /* In b structure:  Free imax, ilen, old a, old j.
2628        Allocate dloc, solve_work, new a, new j */
2629     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
2630     b->maxnz          = b->nz = ainew[n];
2631     (*fact)->factor   = FACTOR_LU;
2632 
2633     (*fact)->info.factor_mallocs    = realloc;
2634     (*fact)->info.fill_ratio_given  = f;
2635     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2636   }
2637 
2638   if (row_identity && col_identity) {
2639     switch (b->bs) {
2640       case 2:
2641         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2642         (*fact)->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2643         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
2644         break;
2645       case 3:
2646         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2647         (*fact)->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2648         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
2649         break;
2650       case 4:
2651 #if defined(PETSC_HAVE_SSE)
2652         PetscTruth sse_enabled;
2653         ierr = PetscSSEIsEnabled(&sse_enabled);CHKERRQ(ierr);
2654         if (sse_enabled) {
2655           (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
2656         } else {
2657           (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2658         }
2659 #else
2660         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2661 #endif
2662         (*fact)->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
2663         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
2664         break;
2665       case 5:
2666         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2667         (*fact)->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
2668         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
2669         break;
2670       case 6:
2671         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2672         (*fact)->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
2673         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
2674         break;
2675       case 7:
2676         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2677         (*fact)->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
2678         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
2679       break;
2680     }
2681   }
2682   PetscFunctionReturn(0);
2683 }
2684 
2685 
2686 
2687 
2688