xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision ac2a4f0d24b3b6a4ee93edbcad41f4bb9e923944)
1 /*$Id: baijfact2.c,v 1.29 1999/06/30 23:51:46 balay Exp bsmith $*/
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/ilu.h"
9 #include "src/inline/dot.h"
10 
11 /* ----------------------------------------------------------- */
12 #undef __FUNC__
13 #define __FUNC__ "MatSolve_SeqBAIJ_N"
14 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
17   IS              iscol=a->col,isrow=a->row;
18   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
19   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
20   MatScalar       *aa=a->a,*v;
21   Scalar          *x,*b,*sum,*tmp,*lsum;
22 
23   PetscFunctionBegin;
24   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26   tmp  = a->solve_work;
27 
28   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
30 
31   /* forward solve the lower triangular */
32   ierr = PetscMemcpy(tmp,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
33   for ( i=1; i<n; i++ ) {
34     v   = aa + bs2*ai[i];
35     vi  = aj + ai[i];
36     nz  = a->diag[i] - ai[i];
37     sum = tmp + bs*i;
38     ierr = PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
39     while (nz--) {
40       Kernel_v_gets_v_minus_A_times_w(bs,sum,v,tmp+bs*(*vi++));
41       v += bs2;
42     }
43   }
44   /* backward solve the upper triangular */
45   lsum = a->solve_work + a->n;
46   for ( i=n-1; i>=0; i-- ){
47     v   = aa + bs2*(a->diag[i] + 1);
48     vi  = aj + a->diag[i] + 1;
49     nz  = ai[i+1] - a->diag[i] - 1;
50     ierr = PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
51     while (nz--) {
52       Kernel_v_gets_v_minus_A_times_w(bs,lsum,v,tmp+bs*(*vi++));
53       v += bs2;
54     }
55     Kernel_w_gets_A_times_v(bs,lsum,aa+bs2*a->diag[i],tmp+i*bs);
56     ierr = PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
57   }
58 
59   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
60   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
61   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
62   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
63   PLogFlops(2*(a->bs2)*(a->nz) - a->bs*a->n);
64   PetscFunctionReturn(0);
65 }
66 
67 #undef __FUNC__
68 #define __FUNC__ "MatSolve_SeqBAIJ_7"
69 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
70 {
71   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
72   IS              iscol=a->col,isrow=a->row;
73   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
74   int             *diag = a->diag;
75   MatScalar       *aa=a->a,*v;
76   Scalar          sum1,sum2,sum3,sum4,sum5,sum6,sum7,x1,x2,x3,x4,x5,x6,x7;
77   Scalar          *x,*b,*tmp;
78 
79   PetscFunctionBegin;
80   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
81   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
82   tmp  = a->solve_work;
83 
84   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
85   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
86 
87   /* forward solve the lower triangular */
88   idx    = 7*(*r++);
89   tmp[0] = b[idx];   tmp[1] = b[1+idx];
90   tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
91   tmp[5] = b[5+idx]; tmp[6] = b[6+idx];
92 
93   for ( i=1; i<n; i++ ) {
94     v     = aa + 49*ai[i];
95     vi    = aj + ai[i];
96     nz    = diag[i] - ai[i];
97     idx   = 7*(*r++);
98     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
99     sum5  = b[4+idx];sum6 = b[5+idx];sum7 = b[6+idx];
100     while (nz--) {
101       idx   = 7*(*vi++);
102       x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
103       x4    = tmp[3+idx];x5 = tmp[4+idx];
104       x6    = tmp[5+idx];x7 = tmp[6+idx];
105       sum1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
106       sum2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
107       sum3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
108       sum4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
109       sum5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
110       sum6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
111       sum7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
112       v += 49;
113     }
114     idx = 7*i;
115     tmp[idx]   = sum1;tmp[1+idx] = sum2;
116     tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
117     tmp[5+idx] = sum6;tmp[6+idx] = sum7;
118   }
119   /* backward solve the upper triangular */
120   for ( i=n-1; i>=0; i-- ){
121     v    = aa + 49*diag[i] + 49;
122     vi   = aj + diag[i] + 1;
123     nz   = ai[i+1] - diag[i] - 1;
124     idt  = 7*i;
125     sum1 = tmp[idt];  sum2 = tmp[1+idt];
126     sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
127     sum6 = tmp[5+idt];sum7 = tmp[6+idt];
128     while (nz--) {
129       idx   = 7*(*vi++);
130       x1    = tmp[idx];   x2 = tmp[1+idx];
131       x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
132       x6    = tmp[5+idx]; x7 = tmp[6+idx];
133       sum1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
134       sum2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
135       sum3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
136       sum4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
137       sum5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
138       sum6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
139       sum7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
140       v += 49;
141     }
142     idc = 7*(*c--);
143     v   = aa + 49*diag[i];
144     x[idc]   = tmp[idt]   = v[0]*sum1+v[7]*sum2+v[14]*sum3+
145                                  v[21]*sum4+v[28]*sum5+v[35]*sum6+v[42]*sum7;
146     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[8]*sum2+v[15]*sum3+
147                                  v[22]*sum4+v[29]*sum5+v[36]*sum6+v[43]*sum7;
148     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[9]*sum2+v[16]*sum3+
149                                  v[23]*sum4+v[30]*sum5+v[37]*sum6+v[44]*sum7;
150     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[10]*sum2+v[17]*sum3+
151                                  v[24]*sum4+v[31]*sum5+v[38]*sum6+v[45]*sum7;
152     x[4+idc] = tmp[4+idt] = v[4]*sum1+v[11]*sum2+v[18]*sum3+
153                                  v[25]*sum4+v[32]*sum5+v[39]*sum6+v[46]*sum7;
154     x[5+idc] = tmp[5+idt] = v[5]*sum1+v[12]*sum2+v[19]*sum3+
155                                  v[26]*sum4+v[33]*sum5+v[40]*sum6+v[47]*sum7;
156     x[6+idc] = tmp[6+idt] = v[6]*sum1+v[13]*sum2+v[20]*sum3+
157                                  v[27]*sum4+v[34]*sum5+v[41]*sum6+v[48]*sum7;
158   }
159 
160   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
161   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
162   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
163   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
164   PLogFlops(2*49*(a->nz) - 7*a->n);
165   PetscFunctionReturn(0);
166 }
167 
168 #undef __FUNC__
169 #define __FUNC__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
170 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
171 {
172   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
173   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
174   int             ierr,*diag = a->diag,jdx;
175   MatScalar       *aa=a->a,*v;
176   Scalar          *x,*b,sum1,sum2,sum3,sum4,sum5,sum6,sum7,x1,x2,x3,x4,x5,x6,x7;
177 
178   PetscFunctionBegin;
179   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
180   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
181   /* forward solve the lower triangular */
182   idx    = 0;
183   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
184   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
185   x[6] = b[6+idx];
186   for ( i=1; i<n; i++ ) {
187     v     =  aa + 49*ai[i];
188     vi    =  aj + ai[i];
189     nz    =  diag[i] - ai[i];
190     idx   =  7*i;
191     sum1  =  b[idx];   sum2 = b[1+idx]; sum3 = b[2+idx];
192     sum4  =  b[3+idx]; sum5 = b[4+idx]; sum6 = b[5+idx];
193     sum7  =  b[6+idx];
194     while (nz--) {
195       jdx   = 7*(*vi++);
196       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
197       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
198       x7    = x[6+jdx];
199       sum1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
200       sum2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
201       sum3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
202       sum4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
203       sum5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
204       sum6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
205       sum7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
206       v += 49;
207      }
208     x[idx]   = sum1;
209     x[1+idx] = sum2;
210     x[2+idx] = sum3;
211     x[3+idx] = sum4;
212     x[4+idx] = sum5;
213     x[5+idx] = sum6;
214     x[6+idx] = sum7;
215   }
216   /* backward solve the upper triangular */
217   for ( i=n-1; i>=0; i-- ){
218     v    = aa + 49*diag[i] + 49;
219     vi   = aj + diag[i] + 1;
220     nz   = ai[i+1] - diag[i] - 1;
221     idt  = 7*i;
222     sum1 = x[idt];   sum2 = x[1+idt];
223     sum3 = x[2+idt]; sum4 = x[3+idt];
224     sum5 = x[4+idt]; sum6 = x[5+idt];
225     sum7 = x[6+idt];
226     while (nz--) {
227       idx   = 7*(*vi++);
228       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
229       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
230       x7    = x[6+idx];
231       sum1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
232       sum2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
233       sum3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
234       sum4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
235       sum5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
236       sum6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
237       sum7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
238       v += 49;
239     }
240     v        = aa + 49*diag[i];
241     x[idt]   = v[0]*sum1 + v[7]*sum2  + v[14]*sum3 + v[21]*sum4
242                          + v[28]*sum5 + v[35]*sum6 + v[42]*sum7;
243     x[1+idt] = v[1]*sum1 + v[8]*sum2  + v[15]*sum3 + v[22]*sum4
244                          + v[29]*sum5 + v[36]*sum6 + v[43]*sum7;
245     x[2+idt] = v[2]*sum1 + v[9]*sum2  + v[16]*sum3 + v[23]*sum4
246                          + v[30]*sum5 + v[37]*sum6 + v[44]*sum7;
247     x[3+idt] = v[3]*sum1 + v[10]*sum2  + v[17]*sum3 + v[24]*sum4
248                          + v[31]*sum5 + v[38]*sum6 + v[45]*sum7;
249     x[4+idt] = v[4]*sum1 + v[11]*sum2  + v[18]*sum3 + v[25]*sum4
250                          + v[32]*sum5 + v[39]*sum6 + v[46]*sum7;
251     x[5+idt] = v[5]*sum1 + v[12]*sum2  + v[19]*sum3 + v[26]*sum4
252                          + v[33]*sum5 + v[40]*sum6 + v[47]*sum7;
253     x[6+idt] = v[6]*sum1 + v[13]*sum2  + v[20]*sum3 + v[27]*sum4
254                          + v[34]*sum5 + v[41]*sum6 + v[48]*sum7;
255   }
256 
257   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
258   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
259   PLogFlops(2*36*(a->nz) - 6*a->n);
260   PetscFunctionReturn(0);
261 }
262 
263 #undef __FUNC__
264 #define __FUNC__ "MatSolve_SeqBAIJ_6"
265 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
266 {
267   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
268   IS              iscol=a->col,isrow=a->row;
269   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
270   int             *diag = a->diag;
271   MatScalar       *aa=a->a,*v;
272   Scalar          *x,*b,sum1,sum2,sum3,sum4,sum5,sum6,x1,x2,x3,x4,x5,x6,*tmp;
273 
274   PetscFunctionBegin;
275   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
276   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
277   tmp  = a->solve_work;
278 
279   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
280   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
281 
282   /* forward solve the lower triangular */
283   idx    = 6*(*r++);
284   tmp[0] = b[idx];   tmp[1] = b[1+idx];
285   tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
286   tmp[4] = b[4+idx]; tmp[5] = b[5+idx];
287   for ( i=1; i<n; i++ ) {
288     v     = aa + 36*ai[i];
289     vi    = aj + ai[i];
290     nz    = diag[i] - ai[i];
291     idx   = 6*(*r++);
292     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
293     sum5  = b[4+idx]; sum6 = b[5+idx];
294     while (nz--) {
295       idx   = 6*(*vi++);
296       x1    = tmp[idx];   x2 = tmp[1+idx]; x3 = tmp[2+idx];
297       x4    = tmp[3+idx]; x5 = tmp[4+idx]; x6 = tmp[5+idx];
298       sum1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
299       sum2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
300       sum3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
301       sum4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
302       sum5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
303       sum6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
304       v += 36;
305     }
306     idx = 6*i;
307     tmp[idx]   = sum1;tmp[1+idx] = sum2;
308     tmp[2+idx] = sum3;tmp[3+idx] = sum4;
309     tmp[4+idx] = sum5;tmp[5+idx] = sum6;
310   }
311   /* backward solve the upper triangular */
312   for ( i=n-1; i>=0; i-- ){
313     v    = aa + 36*diag[i] + 36;
314     vi   = aj + diag[i] + 1;
315     nz   = ai[i+1] - diag[i] - 1;
316     idt  = 6*i;
317     sum1 = tmp[idt];  sum2 = tmp[1+idt];
318     sum3 = tmp[2+idt];sum4 = tmp[3+idt];
319     sum5 = tmp[4+idt];sum6 = tmp[5+idt];
320     while (nz--) {
321       idx   = 6*(*vi++);
322       x1    = tmp[idx];   x2 = tmp[1+idx];
323       x3    = tmp[2+idx]; x4 = tmp[3+idx];
324       x5    = tmp[4+idx]; x6 = tmp[5+idx];
325       sum1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
326       sum2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
327       sum3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
328       sum4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
329       sum5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
330       sum6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
331       v += 36;
332     }
333     idc = 6*(*c--);
334     v   = aa + 36*diag[i];
335     x[idc]   = tmp[idt]   = v[0]*sum1+v[6]*sum2+v[12]*sum3+
336                                  v[18]*sum4+v[24]*sum5+v[30]*sum6;
337     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[7]*sum2+v[13]*sum3+
338                                  v[19]*sum4+v[25]*sum5+v[31]*sum6;
339     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[8]*sum2+v[14]*sum3+
340                                  v[20]*sum4+v[26]*sum5+v[32]*sum6;
341     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[9]*sum2+v[15]*sum3+
342                                  v[21]*sum4+v[27]*sum5+v[33]*sum6;
343     x[4+idc] = tmp[4+idt] = v[4]*sum1+v[10]*sum2+v[16]*sum3+
344                                  v[22]*sum4+v[28]*sum5+v[34]*sum6;
345     x[5+idc] = tmp[5+idt] = v[5]*sum1+v[11]*sum2+v[17]*sum3+
346                                  v[23]*sum4+v[29]*sum5+v[35]*sum6;
347   }
348 
349   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
350   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
351   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
352   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
353   PLogFlops(2*36*(a->nz) - 6*a->n);
354   PetscFunctionReturn(0);
355 }
356 
357 #undef __FUNC__
358 #define __FUNC__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
359 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
360 {
361   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
362   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
363   int             ierr,*diag = a->diag,jdx;
364   MatScalar       *aa=a->a,*v;
365   Scalar          *x,*b,sum1,sum2,sum3,sum4,sum5,sum6,x1,x2,x3,x4,x5,x6;
366 
367   PetscFunctionBegin;
368   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
369   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
370   /* forward solve the lower triangular */
371   idx    = 0;
372   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
373   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
374   for ( i=1; i<n; i++ ) {
375     v     =  aa + 36*ai[i];
376     vi    =  aj + ai[i];
377     nz    =  diag[i] - ai[i];
378     idx   =  6*i;
379     sum1  =  b[idx];   sum2 = b[1+idx]; sum3 = b[2+idx];
380     sum4  =  b[3+idx]; sum5 = b[4+idx]; sum6 = b[5+idx];
381     while (nz--) {
382       jdx   = 6*(*vi++);
383       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
384       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
385       sum1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
386       sum2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
387       sum3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
388       sum4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
389       sum5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
390       sum6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
391       v += 36;
392      }
393     x[idx]   = sum1;
394     x[1+idx] = sum2;
395     x[2+idx] = sum3;
396     x[3+idx] = sum4;
397     x[4+idx] = sum5;
398     x[5+idx] = sum6;
399   }
400   /* backward solve the upper triangular */
401   for ( i=n-1; i>=0; i-- ){
402     v    = aa + 36*diag[i] + 36;
403     vi   = aj + diag[i] + 1;
404     nz   = ai[i+1] - diag[i] - 1;
405     idt  = 6*i;
406     sum1 = x[idt];   sum2 = x[1+idt];
407     sum3 = x[2+idt]; sum4 = x[3+idt];
408     sum5 = x[4+idt]; sum6 = x[5+idt];
409     while (nz--) {
410       idx   = 6*(*vi++);
411       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
412       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
413       sum1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
414       sum2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
415       sum3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
416       sum4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
417       sum5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
418       sum6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
419       v += 36;
420     }
421     v        = aa + 36*diag[i];
422     x[idt]   = v[0]*sum1 + v[6]*sum2  + v[12]*sum3 + v[18]*sum4 + v[24]*sum5 + v[30]*sum6;
423     x[1+idt] = v[1]*sum1 + v[7]*sum2  + v[13]*sum3 + v[19]*sum4 + v[25]*sum5 + v[31]*sum6;
424     x[2+idt] = v[2]*sum1 + v[8]*sum2  + v[14]*sum3 + v[20]*sum4 + v[26]*sum5 + v[32]*sum6;
425     x[3+idt] = v[3]*sum1 + v[9]*sum2  + v[15]*sum3 + v[21]*sum4 + v[27]*sum5 + v[33]*sum6;
426     x[4+idt] = v[4]*sum1 + v[10]*sum2 + v[16]*sum3 + v[22]*sum4 + v[28]*sum5 + v[34]*sum6;
427     x[5+idt] = v[5]*sum1 + v[11]*sum2 + v[17]*sum3 + v[23]*sum4 + v[29]*sum5 + v[35]*sum6;
428   }
429 
430   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
431   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
432   PLogFlops(2*36*(a->nz) - 6*a->n);
433   PetscFunctionReturn(0);
434 }
435 
436 #undef __FUNC__
437 #define __FUNC__ "MatSolve_SeqBAIJ_5"
438 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
439 {
440   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
441   IS              iscol=a->col,isrow=a->row;
442   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
443   int             *diag = a->diag;
444   MatScalar       *aa=a->a,*v;
445   Scalar          *x,*b,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*tmp;
446 
447   PetscFunctionBegin;
448   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
449   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
450   tmp  = a->solve_work;
451 
452   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
453   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
454 
455   /* forward solve the lower triangular */
456   idx    = 5*(*r++);
457   tmp[0] = b[idx];   tmp[1] = b[1+idx];
458   tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
459   for ( i=1; i<n; i++ ) {
460     v     = aa + 25*ai[i];
461     vi    = aj + ai[i];
462     nz    = diag[i] - ai[i];
463     idx   = 5*(*r++);
464     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
465     sum5  = b[4+idx];
466     while (nz--) {
467       idx   = 5*(*vi++);
468       x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
469       x4    = tmp[3+idx];x5 = tmp[4+idx];
470       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
471       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
472       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
473       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
474       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
475       v += 25;
476     }
477     idx = 5*i;
478     tmp[idx]   = sum1;tmp[1+idx] = sum2;
479     tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
480   }
481   /* backward solve the upper triangular */
482   for ( i=n-1; i>=0; i-- ){
483     v    = aa + 25*diag[i] + 25;
484     vi   = aj + diag[i] + 1;
485     nz   = ai[i+1] - diag[i] - 1;
486     idt  = 5*i;
487     sum1 = tmp[idt];  sum2 = tmp[1+idt];
488     sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
489     while (nz--) {
490       idx   = 5*(*vi++);
491       x1    = tmp[idx];   x2 = tmp[1+idx];
492       x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
493       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
494       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
495       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
496       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
497       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
498       v += 25;
499     }
500     idc = 5*(*c--);
501     v   = aa + 25*diag[i];
502     x[idc]   = tmp[idt]   = v[0]*sum1+v[5]*sum2+v[10]*sum3+
503                                  v[15]*sum4+v[20]*sum5;
504     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+
505                                  v[16]*sum4+v[21]*sum5;
506     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+
507                                  v[17]*sum4+v[22]*sum5;
508     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+
509                                  v[18]*sum4+v[23]*sum5;
510     x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+
511                                  v[19]*sum4+v[24]*sum5;
512   }
513 
514   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
515   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
516   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
517   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
518   PLogFlops(2*25*(a->nz) - 5*a->n);
519   PetscFunctionReturn(0);
520 }
521 
522 #undef __FUNC__
523 #define __FUNC__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
524 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
525 {
526   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
527   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
528   int             ierr,*diag = a->diag,jdx;
529   MatScalar       *aa=a->a,*v;
530   Scalar          *x,*b,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;;
531 
532   PetscFunctionBegin;
533   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
534   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
535   /* forward solve the lower triangular */
536   idx    = 0;
537   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
538   for ( i=1; i<n; i++ ) {
539     v     =  aa + 25*ai[i];
540     vi    =  aj + ai[i];
541     nz    =  diag[i] - ai[i];
542     idx   =  5*i;
543     sum1  =  b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];sum5 = b[4+idx];
544     while (nz--) {
545       jdx   = 5*(*vi++);
546       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
547       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
548       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
549       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
550       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
551       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
552       v    += 25;
553     }
554     x[idx]   = sum1;
555     x[1+idx] = sum2;
556     x[2+idx] = sum3;
557     x[3+idx] = sum4;
558     x[4+idx] = sum5;
559   }
560   /* backward solve the upper triangular */
561   for ( i=n-1; i>=0; i-- ){
562     v    = aa + 25*diag[i] + 25;
563     vi   = aj + diag[i] + 1;
564     nz   = ai[i+1] - diag[i] - 1;
565     idt  = 5*i;
566     sum1 = x[idt];  sum2 = x[1+idt];
567     sum3 = x[2+idt];sum4 = x[3+idt]; sum5 = x[4+idt];
568     while (nz--) {
569       idx   = 5*(*vi++);
570       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
571       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
572       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
573       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
574       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
575       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
576       v    += 25;
577     }
578     v        = aa + 25*diag[i];
579     x[idt]   = v[0]*sum1 + v[5]*sum2 + v[10]*sum3  + v[15]*sum4 + v[20]*sum5;
580     x[1+idt] = v[1]*sum1 + v[6]*sum2 + v[11]*sum3  + v[16]*sum4 + v[21]*sum5;
581     x[2+idt] = v[2]*sum1 + v[7]*sum2 + v[12]*sum3  + v[17]*sum4 + v[22]*sum5;
582     x[3+idt] = v[3]*sum1 + v[8]*sum2 + v[13]*sum3  + v[18]*sum4 + v[23]*sum5;
583     x[4+idt] = v[4]*sum1 + v[9]*sum2 + v[14]*sum3  + v[19]*sum4 + v[24]*sum5;
584   }
585 
586   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
587   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
588   PLogFlops(2*25*(a->nz) - 5*a->n);
589   PetscFunctionReturn(0);
590 }
591 
592 #undef __FUNC__
593 #define __FUNC__ "MatSolve_SeqBAIJ_4"
594 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
595 {
596   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
597   IS              iscol=a->col,isrow=a->row;
598   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
599   int             *diag = a->diag;
600   MatScalar       *aa=a->a,*v;
601   Scalar          *x,*b,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*tmp;
602 
603   PetscFunctionBegin;
604   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
605   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
606   tmp  = a->solve_work;
607 
608   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
609   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
610 
611   /* forward solve the lower triangular */
612   idx    = 4*(*r++);
613   tmp[0] = b[idx];   tmp[1] = b[1+idx];
614   tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
615   for ( i=1; i<n; i++ ) {
616     v     = aa + 16*ai[i];
617     vi    = aj + ai[i];
618     nz    = diag[i] - ai[i];
619     idx   = 4*(*r++);
620     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
621     while (nz--) {
622       idx   = 4*(*vi++);
623       x1    = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx];
624       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
625       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
626       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
627       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
628       v    += 16;
629     }
630     idx        = 4*i;
631     tmp[idx]   = sum1;tmp[1+idx] = sum2;
632     tmp[2+idx] = sum3;tmp[3+idx] = sum4;
633   }
634   /* backward solve the upper triangular */
635   for ( i=n-1; i>=0; i-- ){
636     v    = aa + 16*diag[i] + 16;
637     vi   = aj + diag[i] + 1;
638     nz   = ai[i+1] - diag[i] - 1;
639     idt  = 4*i;
640     sum1 = tmp[idt];  sum2 = tmp[1+idt];
641     sum3 = tmp[2+idt];sum4 = tmp[3+idt];
642     while (nz--) {
643       idx   = 4*(*vi++);
644       x1    = tmp[idx];   x2 = tmp[1+idx];
645       x3    = tmp[2+idx]; x4 = tmp[3+idx];
646       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
647       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
648       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
649       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
650       v += 16;
651     }
652     idc      = 4*(*c--);
653     v        = aa + 16*diag[i];
654     x[idc]   = tmp[idt]   = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4;
655     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4;
656     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4;
657     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4;
658   }
659 
660   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
661   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
662   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
663   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
664   PLogFlops(2*16*(a->nz) - 4*a->n);
665   PetscFunctionReturn(0);
666 }
667 
668 
669 /*
670       Special case where the matrix was ILU(0) factored in the natural
671    ordering. This eliminates the need for the column and row permutation.
672 */
673 #undef __FUNC__
674 #define __FUNC__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
675 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
676 {
677   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
678   int             n=a->mbs,*ai=a->i,*aj=a->j;
679   int             ierr,*diag = a->diag;
680   MatScalar       *aa=a->a;
681   Scalar          *x,*b;
682 
683   PetscFunctionBegin;
684   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
685   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
686 
687 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
688   {
689     static Scalar w[2000]; /* very BAD need to fix */
690     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
691   }
692 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
693   {
694     static Scalar w[2000]; /* very BAD need to fix */
695     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
696   }
697 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
698   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
699 #else
700   {
701     Scalar    sum1,sum2,sum3,sum4,x1,x2,x3,x4;
702     MatScalar *v;
703     int       jdx,idt,idx,nz,*vi,i;
704 
705   /* forward solve the lower triangular */
706   idx    = 0;
707   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
708   for ( i=1; i<n; i++ ) {
709     v     =  aa      + 16*ai[i];
710     vi    =  aj      + ai[i];
711     nz    =  diag[i] - ai[i];
712     idx   +=  4;
713     sum1  =  b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
714     while (nz--) {
715       jdx   = 4*(*vi++);
716       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
717       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
718       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
719       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
720       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
721       v    += 16;
722     }
723     x[idx]   = sum1;
724     x[1+idx] = sum2;
725     x[2+idx] = sum3;
726     x[3+idx] = sum4;
727   }
728   /* backward solve the upper triangular */
729   for ( i=n-1; i>=0; i-- ){
730     v    = aa + 16*diag[i] + 16;
731     vi   = aj + diag[i] + 1;
732     nz   = ai[i+1] - diag[i] - 1;
733     idt  = 4*i;
734     sum1 = x[idt];  sum2 = x[1+idt];
735     sum3 = x[2+idt];sum4 = x[3+idt];
736     while (nz--) {
737       idx   = 4*(*vi++);
738       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
739       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
740       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
741       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
742       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
743       v    += 16;
744     }
745     v        = aa + 16*diag[i];
746     x[idt]   = v[0]*sum1 + v[4]*sum2 + v[8]*sum3  + v[12]*sum4;
747     x[1+idt] = v[1]*sum1 + v[5]*sum2 + v[9]*sum3  + v[13]*sum4;
748     x[2+idt] = v[2]*sum1 + v[6]*sum2 + v[10]*sum3 + v[14]*sum4;
749     x[3+idt] = v[3]*sum1 + v[7]*sum2 + v[11]*sum3 + v[15]*sum4;
750   }
751   }
752 #endif
753 
754   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
755   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
756   PLogFlops(2*16*(a->nz) - 4*a->n);
757   PetscFunctionReturn(0);
758 }
759 
760 #undef __FUNC__
761 #define __FUNC__ "MatSolve_SeqBAIJ_3"
762 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
763 {
764   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
765   IS              iscol=a->col,isrow=a->row;
766   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
767   int             *diag = a->diag;
768   MatScalar       *aa=a->a,*v;
769   Scalar          *x,*b,sum1,sum2,sum3,x1,x2,x3,*tmp;
770 
771   PetscFunctionBegin;
772   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
773   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
774   tmp  = a->solve_work;
775 
776   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
777   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
778 
779   /* forward solve the lower triangular */
780   idx    = 3*(*r++);
781   tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx];
782   for ( i=1; i<n; i++ ) {
783     v     = aa + 9*ai[i];
784     vi    = aj + ai[i];
785     nz    = diag[i] - ai[i];
786     idx   = 3*(*r++);
787     sum1  = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx];
788     while (nz--) {
789       idx   = 3*(*vi++);
790       x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
791       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
792       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
793       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
794       v += 9;
795     }
796     idx = 3*i;
797     tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3;
798   }
799   /* backward solve the upper triangular */
800   for ( i=n-1; i>=0; i-- ){
801     v    = aa + 9*diag[i] + 9;
802     vi   = aj + diag[i] + 1;
803     nz   = ai[i+1] - diag[i] - 1;
804     idt  = 3*i;
805     sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt];
806     while (nz--) {
807       idx   = 3*(*vi++);
808       x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
809       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
810       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
811       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
812       v += 9;
813     }
814     idc = 3*(*c--);
815     v   = aa + 9*diag[i];
816     x[idc]   = tmp[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
817     x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
818     x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
819   }
820   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
821   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
822   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
823   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
824   PLogFlops(2*9*(a->nz) - 3*a->n);
825   PetscFunctionReturn(0);
826 }
827 
828 /*
829       Special case where the matrix was ILU(0) factored in the natural
830    ordering. This eliminates the need for the column and row permutation.
831 */
832 #undef __FUNC__
833 #define __FUNC__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
834 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
835 {
836   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
837   int             n=a->mbs,*ai=a->i,*aj=a->j;
838   int             ierr,*diag = a->diag;
839   MatScalar       *aa=a->a, *v;
840   Scalar          *x,*b,sum1,sum2,sum3,x1,x2,x3;
841   int             jdx,idt,idx,nz,*vi,i;
842 
843   PetscFunctionBegin;
844   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
845   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
846 
847 
848   /* forward solve the lower triangular */
849   idx    = 0;
850   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
851   for ( i=1; i<n; i++ ) {
852     v     =  aa      + 9*ai[i];
853     vi    =  aj      + ai[i];
854     nz    =  diag[i] - ai[i];
855     idx   +=  3;
856     sum1  =  b[idx];sum2 = b[1+idx];sum3 = b[2+idx];
857     while (nz--) {
858       jdx   = 3*(*vi++);
859       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
860       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
861       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
862       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
863       v    += 9;
864     }
865     x[idx]   = sum1;
866     x[1+idx] = sum2;
867     x[2+idx] = sum3;
868   }
869   /* backward solve the upper triangular */
870   for ( i=n-1; i>=0; i-- ){
871     v    = aa + 9*diag[i] + 9;
872     vi   = aj + diag[i] + 1;
873     nz   = ai[i+1] - diag[i] - 1;
874     idt  = 3*i;
875     sum1 = x[idt];  sum2 = x[1+idt];
876     sum3 = x[2+idt];
877     while (nz--) {
878       idx   = 3*(*vi++);
879       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
880       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
881       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
882       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
883       v    += 9;
884     }
885     v        = aa +  9*diag[i];
886     x[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
887     x[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
888     x[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
889   }
890 
891   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   PLogFlops(2*9*(a->nz) - 3*a->n);
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNC__
898 #define __FUNC__ "MatSolve_SeqBAIJ_2"
899 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
902   IS              iscol=a->col,isrow=a->row;
903   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
904   int             *diag = a->diag;
905   MatScalar       *aa=a->a,*v;
906   Scalar          *x,*b,sum1,sum2,x1,x2,*tmp;
907 
908   PetscFunctionBegin;
909   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
910   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
911   tmp  = a->solve_work;
912 
913   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
914   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
915 
916   /* forward solve the lower triangular */
917   idx    = 2*(*r++);
918   tmp[0] = b[idx]; tmp[1] = b[1+idx];
919   for ( i=1; i<n; i++ ) {
920     v     = aa + 4*ai[i];
921     vi    = aj + ai[i];
922     nz    = diag[i] - ai[i];
923     idx   = 2*(*r++);
924     sum1  = b[idx]; sum2 = b[1+idx];
925     while (nz--) {
926       idx   = 2*(*vi++);
927       x1    = tmp[idx]; x2 = tmp[1+idx];
928       sum1 -= v[0]*x1 + v[2]*x2;
929       sum2 -= v[1]*x1 + v[3]*x2;
930       v += 4;
931     }
932     idx = 2*i;
933     tmp[idx] = sum1; tmp[1+idx] = sum2;
934   }
935   /* backward solve the upper triangular */
936   for ( i=n-1; i>=0; i-- ){
937     v    = aa + 4*diag[i] + 4;
938     vi   = aj + diag[i] + 1;
939     nz   = ai[i+1] - diag[i] - 1;
940     idt  = 2*i;
941     sum1 = tmp[idt]; sum2 = tmp[1+idt];
942     while (nz--) {
943       idx   = 2*(*vi++);
944       x1    = tmp[idx]; x2 = tmp[1+idx];
945       sum1 -= v[0]*x1 + v[2]*x2;
946       sum2 -= v[1]*x1 + v[3]*x2;
947       v += 4;
948     }
949     idc = 2*(*c--);
950     v   = aa + 4*diag[i];
951     x[idc]   = tmp[idt]   = v[0]*sum1 + v[2]*sum2;
952     x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2;
953   }
954   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
955   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
956   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
957   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
958   PLogFlops(2*4*(a->nz) - 2*a->n);
959   PetscFunctionReturn(0);
960 }
961 
962 /*
963       Special case where the matrix was ILU(0) factored in the natural
964    ordering. This eliminates the need for the column and row permutation.
965 */
966 #undef __FUNC__
967 #define __FUNC__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
968 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
969 {
970   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
971   int             n=a->mbs,*ai=a->i,*aj=a->j;
972   int             ierr,*diag = a->diag;
973   MatScalar       *aa=a->a,*v;
974   Scalar          *x,*b,sum1,sum2,x1,x2;
975   int             jdx,idt,idx,nz,*vi,i;
976 
977   PetscFunctionBegin;
978   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
979   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
980 
981   /* forward solve the lower triangular */
982   idx    = 0;
983   x[0]   = b[0]; x[1] = b[1];
984   for ( i=1; i<n; i++ ) {
985     v     =  aa      + 4*ai[i];
986     vi    =  aj      + ai[i];
987     nz    =  diag[i] - ai[i];
988     idx   +=  2;
989     sum1  =  b[idx];sum2 = b[1+idx];
990     while (nz--) {
991       jdx   = 2*(*vi++);
992       x1    = x[jdx];x2 = x[1+jdx];
993       sum1 -= v[0]*x1 + v[2]*x2;
994       sum2 -= v[1]*x1 + v[3]*x2;
995       v    += 4;
996     }
997     x[idx]   = sum1;
998     x[1+idx] = sum2;
999   }
1000   /* backward solve the upper triangular */
1001   for ( i=n-1; i>=0; i-- ){
1002     v    = aa + 4*diag[i] + 4;
1003     vi   = aj + diag[i] + 1;
1004     nz   = ai[i+1] - diag[i] - 1;
1005     idt  = 2*i;
1006     sum1 = x[idt];  sum2 = x[1+idt];
1007     while (nz--) {
1008       idx   = 2*(*vi++);
1009       x1    = x[idx];   x2 = x[1+idx];
1010       sum1 -= v[0]*x1 + v[2]*x2;
1011       sum2 -= v[1]*x1 + v[3]*x2;
1012       v    += 4;
1013     }
1014     v        = aa +  4*diag[i];
1015     x[idt]   = v[0]*sum1 + v[2]*sum2;
1016     x[1+idt] = v[1]*sum1 + v[3]*sum2;
1017   }
1018 
1019   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1020   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1021   PLogFlops(2*4*(a->nz) - 2*a->n);
1022   PetscFunctionReturn(0);
1023 }
1024 
1025 #undef __FUNC__
1026 #define __FUNC__ "MatSolve_SeqBAIJ_1"
1027 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1028 {
1029   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1030   IS              iscol=a->col,isrow=a->row;
1031   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
1032   int             *diag = a->diag;
1033   MatScalar       *aa=a->a,*v;
1034   Scalar          *x,*b,sum1,*tmp;
1035 
1036   PetscFunctionBegin;
1037   if (!n) PetscFunctionReturn(0);
1038 
1039   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1040   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1041   tmp  = a->solve_work;
1042 
1043   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1044   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1045 
1046   /* forward solve the lower triangular */
1047   tmp[0] = b[*r++];
1048   for ( i=1; i<n; i++ ) {
1049     v     = aa + ai[i];
1050     vi    = aj + ai[i];
1051     nz    = diag[i] - ai[i];
1052     sum1  = b[*r++];
1053     while (nz--) {
1054       sum1 -= (*v++)*tmp[*vi++];
1055     }
1056     tmp[i] = sum1;
1057   }
1058   /* backward solve the upper triangular */
1059   for ( i=n-1; i>=0; i-- ){
1060     v    = aa + diag[i] + 1;
1061     vi   = aj + diag[i] + 1;
1062     nz   = ai[i+1] - diag[i] - 1;
1063     sum1 = tmp[i];
1064     while (nz--) {
1065       sum1 -= (*v++)*tmp[*vi++];
1066     }
1067     x[*c--] = tmp[i] = aa[diag[i]]*sum1;
1068   }
1069 
1070   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1071   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1072   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1073   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1074   PLogFlops(2*1*(a->nz) - a->n);
1075   PetscFunctionReturn(0);
1076 }
1077 /*
1078       Special case where the matrix was ILU(0) factored in the natural
1079    ordering. This eliminates the need for the column and row permutation.
1080 */
1081 #undef __FUNC__
1082 #define __FUNC__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
1083 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1084 {
1085   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1086   int             n=a->mbs,*ai=a->i,*aj=a->j;
1087   int             ierr,*diag = a->diag;
1088   MatScalar       *aa=a->a;
1089   Scalar          *x,*b;
1090   Scalar          sum1,x1;
1091   MatScalar       *v;
1092   int             jdx,idt,idx,nz,*vi,i;
1093 
1094   PetscFunctionBegin;
1095   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1096   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1097 
1098   /* forward solve the lower triangular */
1099   idx    = 0;
1100   x[0]   = b[0];
1101   for ( i=1; i<n; i++ ) {
1102     v     =  aa      + ai[i];
1103     vi    =  aj      + ai[i];
1104     nz    =  diag[i] - ai[i];
1105     idx   +=  1;
1106     sum1  =  b[idx];
1107     while (nz--) {
1108       jdx   = *vi++;
1109       x1    = x[jdx];
1110       sum1 -= v[0]*x1;
1111       v    += 1;
1112     }
1113     x[idx]   = sum1;
1114   }
1115   /* backward solve the upper triangular */
1116   for ( i=n-1; i>=0; i-- ){
1117     v    = aa + diag[i] + 1;
1118     vi   = aj + diag[i] + 1;
1119     nz   = ai[i+1] - diag[i] - 1;
1120     idt  = i;
1121     sum1 = x[idt];
1122     while (nz--) {
1123       idx   = *vi++;
1124       x1    = x[idx];
1125       sum1 -= v[0]*x1;
1126       v    += 1;
1127     }
1128     v        = aa +  diag[i];
1129     x[idt]   = v[0]*sum1;
1130   }
1131   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1132   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1133   PLogFlops(2*(a->nz) - a->n);
1134   PetscFunctionReturn(0);
1135 }
1136 
1137 /* ----------------------------------------------------------------*/
1138 /*
1139      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
1140    except that the data structure of Mat_SeqAIJ is slightly different.
1141    Not a good example of code reuse.
1142 */
1143 extern int MatMissingDiag_SeqBAIJ(Mat);
1144 
1145 #undef __FUNC__
1146 #define __FUNC__ "MatILUFactorSymbolic_SeqBAIJ"
1147 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
1148 {
1149   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
1150   IS          isicol;
1151   int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
1152   int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
1153   int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0, dcount = 0;
1154   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2, levels, diagonal_fill;
1155   PetscTruth  col_identity, row_identity;
1156   double      f;
1157 
1158   PetscFunctionBegin;
1159   if (info) {
1160     f             = info->fill;
1161     levels        = (int) info->levels;
1162     diagonal_fill = (int) info->diagonal_fill;
1163   } else {
1164     f             = 1.0;
1165     levels        = 0;
1166     diagonal_fill = 0;
1167   }
1168   ierr = ISInvertPermutation(iscol,&isicol);CHKERRQ(ierr);
1169 
1170   /* special case that simply copies fill pattern */
1171   PetscValidHeaderSpecific(isrow,IS_COOKIE);
1172   PetscValidHeaderSpecific(iscol,IS_COOKIE);
1173   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1174   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
1175   if (levels == 0 && row_identity && col_identity) {
1176     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
1177     (*fact)->factor = FACTOR_LU;
1178     b               = (Mat_SeqBAIJ *) (*fact)->data;
1179     if (!b->diag) {
1180       ierr = MatMarkDiag_SeqBAIJ(*fact);CHKERRQ(ierr);
1181     }
1182     ierr = MatMissingDiag_SeqBAIJ(*fact);CHKERRQ(ierr);
1183     b->row        = isrow;
1184     b->col        = iscol;
1185     b->icol       = isicol;
1186     b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
1187    /*
1188         Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
1189         for ILU(0) factorization with natural ordering
1190    */
1191     switch (b->bs) {
1192     case 2:
1193       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
1194       (*fact)->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
1195       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
1196       break;
1197     case 3:
1198       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
1199       (*fact)->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
1200       PLogInfo(A,"UMatILUFactorSymbolic_SeqBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
1201       break;
1202     case 4:
1203       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
1204       (*fact)->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
1205       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
1206       break;
1207     case 5:
1208       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
1209       (*fact)->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
1210       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
1211       break;
1212     case 6:
1213       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
1214       (*fact)->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
1215       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
1216       break;
1217     case 7:
1218       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
1219       (*fact)->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
1220       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
1221     break;
1222   }
1223     PetscFunctionReturn(0);
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228 
1229   /* get new row pointers */
1230   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) );CHKPTRQ(ainew);
1231   ainew[0] = 0;
1232   /* don't know how many column pointers are needed so estimate */
1233   jmax = (int) (f*ai[n] + 1);
1234   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) );CHKPTRQ(ajnew);
1235   /* ajfill is level of fill for each fill entry */
1236   ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) );CHKPTRQ(ajfill);
1237   /* fill is a linked list of nonzeros in active row */
1238   fill = (int *) PetscMalloc( (n+1)*sizeof(int));CHKPTRQ(fill);
1239   /* im is level for each filled value */
1240   im = (int *) PetscMalloc( (n+1)*sizeof(int));CHKPTRQ(im);
1241   /* dloc is location of diagonal in factor */
1242   dloc = (int *) PetscMalloc( (n+1)*sizeof(int));CHKPTRQ(dloc);
1243   dloc[0]  = 0;
1244   for ( prow=0; prow<n; prow++ ) {
1245 
1246     /* copy prow into linked list */
1247     nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
1248     if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,1,"Empty row in matrix");
1249     xi         = aj + ai[r[prow]];
1250     fill[n]    = n;
1251     fill[prow] = -1; /* marker for diagonal entry */
1252     while (nz--) {
1253       fm  = n;
1254       idx = ic[*xi++];
1255       do {
1256         m  = fm;
1257         fm = fill[m];
1258       } while (fm < idx);
1259       fill[m]   = idx;
1260       fill[idx] = fm;
1261       im[idx]   = 0;
1262     }
1263 
1264     /* make sure diagonal entry is included */
1265     if (diagonal_fill && fill[prow] == -1) {
1266       fm = n;
1267       while (fill[fm] < prow) fm = fill[fm];
1268       fill[prow] = fill[fm];  /* insert diagonal into linked list */
1269       fill[fm]   = prow;
1270       im[prow]   = 0;
1271       nzf++;
1272       dcount++;
1273     }
1274 
1275     nzi = 0;
1276     row = fill[n];
1277     while ( row < prow ) {
1278       incrlev = im[row] + 1;
1279       nz      = dloc[row];
1280       xi      = ajnew  + ainew[row] + nz + 1;
1281       flev    = ajfill + ainew[row] + nz + 1;
1282       nnz     = ainew[row+1] - ainew[row] - nz - 1;
1283       fm      = row;
1284       while (nnz-- > 0) {
1285         idx = *xi++;
1286         if (*flev + incrlev > levels) {
1287           flev++;
1288           continue;
1289         }
1290         do {
1291           m  = fm;
1292           fm = fill[m];
1293         } while (fm < idx);
1294         if (fm != idx) {
1295           im[idx]   = *flev + incrlev;
1296           fill[m]   = idx;
1297           fill[idx] = fm;
1298           fm        = idx;
1299           nzf++;
1300         } else {
1301           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
1302         }
1303         flev++;
1304       }
1305       row = fill[row];
1306       nzi++;
1307     }
1308     /* copy new filled row into permanent storage */
1309     ainew[prow+1] = ainew[prow] + nzf;
1310     if (ainew[prow+1] > jmax) {
1311 
1312       /* estimate how much additional space we will need */
1313       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1314       /* just double the memory each time */
1315       int maxadd = jmax;
1316       /* maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n); */
1317       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
1318       jmax += maxadd;
1319 
1320       /* allocate a longer ajnew and ajfill */
1321       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
1322       ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
1323       ierr = PetscFree(ajnew);CHKERRQ(ierr);
1324       ajnew = xi;
1325       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
1326       ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
1327       ierr = PetscFree(ajfill);CHKERRQ(ierr);
1328       ajfill = xi;
1329       realloc++; /* count how many reallocations are needed */
1330     }
1331     xi          = ajnew + ainew[prow];
1332     flev        = ajfill + ainew[prow];
1333     dloc[prow]  = nzi;
1334     fm          = fill[n];
1335     while (nzf--) {
1336       *xi++   = fm;
1337       *flev++ = im[fm];
1338       fm      = fill[fm];
1339     }
1340     /* make sure row has diagonal entry */
1341     if (ajnew[ainew[prow]+dloc[prow]] != prow) {
1342       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,1,"Row %d has missing diagonal in factored matrix\n\
1343     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
1344     }
1345   }
1346   ierr = PetscFree(ajfill);CHKERRQ(ierr);
1347   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1348   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1349   ierr = PetscFree(fill);CHKERRQ(ierr);
1350   ierr = PetscFree(im);CHKERRQ(ierr);
1351 
1352   {
1353     double af = ((double)ainew[n])/((double)ai[n]);
1354     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
1355              realloc,f,af);
1356     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
1357     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
1358     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
1359     if (diagonal_fill) {
1360       PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
1361     }
1362   }
1363 
1364   /* put together the new matrix */
1365   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
1366   PLogObjectParent(*fact,isicol);
1367   b = (Mat_SeqBAIJ *) (*fact)->data;
1368   ierr = PetscFree(b->imax);CHKERRQ(ierr);
1369   b->singlemalloc = 0;
1370   len = bs2*ainew[n]*sizeof(MatScalar);
1371   /* the next line frees the default space generated by the Create() */
1372   ierr = PetscFree(b->a);CHKERRQ(ierr);
1373   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
1374   b->a          = (MatScalar *) PetscMalloc( len );CHKPTRQ(b->a);
1375   b->j          = ajnew;
1376   b->i          = ainew;
1377   for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
1378   b->diag       = dloc;
1379   b->ilen       = 0;
1380   b->imax       = 0;
1381   b->row        = isrow;
1382   b->col        = iscol;
1383   b->icol       = isicol;
1384   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
1385   /* In b structure:  Free imax, ilen, old a, old j.
1386      Allocate dloc, solve_work, new a, new j */
1387   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
1388   b->maxnz          = b->nz = ainew[n];
1389   (*fact)->factor   = FACTOR_LU;
1390 
1391   (*fact)->info.factor_mallocs    = realloc;
1392   (*fact)->info.fill_ratio_given  = f;
1393   (*fact)->info.fill_ratio_needed = ((double)ainew[n])/((double)ai[prow]);
1394 
1395   PetscFunctionReturn(0);
1396 }
1397 
1398 
1399 
1400 
1401