xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision c60f02097ec1bad1d486ea4e6635560ad6843df9)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "../src/mat/impls/baij/seq/baij.h"
8 #include "../src/mat/blockinvert.h"
9 
10 #undef __FUNCT__
11 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
12 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
13 {
14   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
15   PetscErrorCode ierr;
16   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17   PetscInt       *diag = a->diag;
18   MatScalar      *aa=a->a,*v;
19   PetscScalar    s1,*x,*b;
20 
21   PetscFunctionBegin;
22   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
23   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25 
26   /* forward solve the U^T */
27   for (i=0; i<n; i++) {
28 
29     v     = aa + diag[i];
30     /* multiply by the inverse of the block diagonal */
31     s1    = (*v++)*x[i];
32     vi    = aj + diag[i] + 1;
33     nz    = ai[i+1] - diag[i] - 1;
34     while (nz--) {
35       x[*vi++]  -= (*v++)*s1;
36     }
37     x[i]   = s1;
38   }
39   /* backward solve the L^T */
40   for (i=n-1; i>=0; i--){
41     v    = aa + diag[i] - 1;
42     vi   = aj + diag[i] - 1;
43     nz   = diag[i] - ai[i];
44     s1   = x[i];
45     while (nz--) {
46       x[*vi--]   -=  (*v--)*s1;
47     }
48   }
49   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
52   PetscFunctionReturn(0);
53 }
54 
55 #undef __FUNCT__
56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
57 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58 {
59   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
60   PetscErrorCode ierr;
61   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
62   PetscInt       *diag = a->diag,oidx;
63   MatScalar      *aa=a->a,*v;
64   PetscScalar    s1,s2,x1,x2;
65   PetscScalar    *x,*b;
66 
67   PetscFunctionBegin;
68   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
69   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
70   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
71 
72   /* forward solve the U^T */
73   idx = 0;
74   for (i=0; i<n; i++) {
75 
76     v     = aa + 4*diag[i];
77     /* multiply by the inverse of the block diagonal */
78     x1 = x[idx];   x2 = x[1+idx];
79     s1 = v[0]*x1  +  v[1]*x2;
80     s2 = v[2]*x1  +  v[3]*x2;
81     v += 4;
82 
83     vi    = aj + diag[i] + 1;
84     nz    = ai[i+1] - diag[i] - 1;
85     while (nz--) {
86       oidx = 2*(*vi++);
87       x[oidx]   -= v[0]*s1  +  v[1]*s2;
88       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
89       v  += 4;
90     }
91     x[idx]   = s1;x[1+idx] = s2;
92     idx += 2;
93   }
94   /* backward solve the L^T */
95   for (i=n-1; i>=0; i--){
96     v    = aa + 4*diag[i] - 4;
97     vi   = aj + diag[i] - 1;
98     nz   = diag[i] - ai[i];
99     idt  = 2*i;
100     s1   = x[idt];  s2 = x[1+idt];
101     while (nz--) {
102       idx   = 2*(*vi--);
103       x[idx]   -=  v[0]*s1 +  v[1]*s2;
104       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
105       v -= 4;
106     }
107   }
108   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
109   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
110   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
111   PetscFunctionReturn(0);
112 }
113 
114 #undef __FUNCT__
115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
116 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
117 {
118   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
119   PetscErrorCode ierr;
120   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
121   PetscInt       *diag = a->diag,oidx;
122   MatScalar      *aa=a->a,*v;
123   PetscScalar    s1,s2,s3,x1,x2,x3;
124   PetscScalar    *x,*b;
125 
126   PetscFunctionBegin;
127   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
128   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
130 
131   /* forward solve the U^T */
132   idx = 0;
133   for (i=0; i<n; i++) {
134 
135     v     = aa + 9*diag[i];
136     /* multiply by the inverse of the block diagonal */
137     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
138     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
139     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
140     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
141     v += 9;
142 
143     vi    = aj + diag[i] + 1;
144     nz    = ai[i+1] - diag[i] - 1;
145     while (nz--) {
146       oidx = 3*(*vi++);
147       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
148       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
149       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
150       v  += 9;
151     }
152     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
153     idx += 3;
154   }
155   /* backward solve the L^T */
156   for (i=n-1; i>=0; i--){
157     v    = aa + 9*diag[i] - 9;
158     vi   = aj + diag[i] - 1;
159     nz   = diag[i] - ai[i];
160     idt  = 3*i;
161     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
162     while (nz--) {
163       idx   = 3*(*vi--);
164       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
165       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
166       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
167       v -= 9;
168     }
169   }
170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode ierr;
182   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183   PetscInt       *diag = a->diag,oidx;
184   MatScalar      *aa=a->a,*v;
185   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
186   PetscScalar    *x,*b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 16*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
201     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
202     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
203     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
204     v += 16;
205 
206     vi    = aj + diag[i] + 1;
207     nz    = ai[i+1] - diag[i] - 1;
208     while (nz--) {
209       oidx = 4*(*vi++);
210       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
211       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
212       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
213       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
214       v  += 16;
215     }
216     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
217     idx += 4;
218   }
219   /* backward solve the L^T */
220   for (i=n-1; i>=0; i--){
221     v    = aa + 16*diag[i] - 16;
222     vi   = aj + diag[i] - 1;
223     nz   = diag[i] - ai[i];
224     idt  = 4*i;
225     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
226     while (nz--) {
227       idx   = 4*(*vi--);
228       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
229       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
230       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
231       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
232       v -= 16;
233     }
234   }
235   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
236   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
237   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
238   PetscFunctionReturn(0);
239 }
240 
241 #undef __FUNCT__
242 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
243 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
244 {
245   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
246   PetscErrorCode ierr;
247   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
248   PetscInt       *diag = a->diag,oidx;
249   MatScalar      *aa=a->a,*v;
250   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
251   PetscScalar    *x,*b;
252 
253   PetscFunctionBegin;
254   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
255   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
256   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
257 
258   /* forward solve the U^T */
259   idx = 0;
260   for (i=0; i<n; i++) {
261 
262     v     = aa + 25*diag[i];
263     /* multiply by the inverse of the block diagonal */
264     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
265     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
266     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
267     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
268     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
269     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
270     v += 25;
271 
272     vi    = aj + diag[i] + 1;
273     nz    = ai[i+1] - diag[i] - 1;
274     while (nz--) {
275       oidx = 5*(*vi++);
276       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
277       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
278       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
279       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
280       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
281       v  += 25;
282     }
283     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
284     idx += 5;
285   }
286   /* backward solve the L^T */
287   for (i=n-1; i>=0; i--){
288     v    = aa + 25*diag[i] - 25;
289     vi   = aj + diag[i] - 1;
290     nz   = diag[i] - ai[i];
291     idt  = 5*i;
292     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
293     while (nz--) {
294       idx   = 5*(*vi--);
295       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
296       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
297       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
298       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
299       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
300       v -= 25;
301     }
302   }
303   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
304   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
305   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
306   PetscFunctionReturn(0);
307 }
308 
309 #undef __FUNCT__
310 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
311 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
312 {
313   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
314   PetscErrorCode ierr;
315   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
316   PetscInt       *diag = a->diag,oidx;
317   MatScalar      *aa=a->a,*v;
318   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
319   PetscScalar    *x,*b;
320 
321   PetscFunctionBegin;
322   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
323   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
324   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
325 
326   /* forward solve the U^T */
327   idx = 0;
328   for (i=0; i<n; i++) {
329 
330     v     = aa + 36*diag[i];
331     /* multiply by the inverse of the block diagonal */
332     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
333     x6    = x[5+idx];
334     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
335     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
336     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
337     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
338     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
339     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
340     v += 36;
341 
342     vi    = aj + diag[i] + 1;
343     nz    = ai[i+1] - diag[i] - 1;
344     while (nz--) {
345       oidx = 6*(*vi++);
346       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
347       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
348       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
349       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
350       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
351       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
352       v  += 36;
353     }
354     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
355     x[5+idx] = s6;
356     idx += 6;
357   }
358   /* backward solve the L^T */
359   for (i=n-1; i>=0; i--){
360     v    = aa + 36*diag[i] - 36;
361     vi   = aj + diag[i] - 1;
362     nz   = diag[i] - ai[i];
363     idt  = 6*i;
364     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
365     s6 = x[5+idt];
366     while (nz--) {
367       idx   = 6*(*vi--);
368       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
369       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
370       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
371       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
372       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
373       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
374       v -= 36;
375     }
376   }
377   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
378   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
379   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
380   PetscFunctionReturn(0);
381 }
382 
383 #undef __FUNCT__
384 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
385 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
386 {
387   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
388   PetscErrorCode ierr;
389   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
390   PetscInt       *diag = a->diag,oidx;
391   MatScalar      *aa=a->a,*v;
392   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
393   PetscScalar    *x,*b;
394 
395   PetscFunctionBegin;
396   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
397   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
398   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
399 
400   /* forward solve the U^T */
401   idx = 0;
402   for (i=0; i<n; i++) {
403 
404     v     = aa + 49*diag[i];
405     /* multiply by the inverse of the block diagonal */
406     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
407     x6    = x[5+idx]; x7 = x[6+idx];
408     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
409     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
410     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
411     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
412     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
413     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
414     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
415     v += 49;
416 
417     vi    = aj + diag[i] + 1;
418     nz    = ai[i+1] - diag[i] - 1;
419     while (nz--) {
420       oidx = 7*(*vi++);
421       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
422       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
423       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
424       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
425       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
426       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
427       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
428       v  += 49;
429     }
430     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
431     x[5+idx] = s6;x[6+idx] = s7;
432     idx += 7;
433   }
434   /* backward solve the L^T */
435   for (i=n-1; i>=0; i--){
436     v    = aa + 49*diag[i] - 49;
437     vi   = aj + diag[i] - 1;
438     nz   = diag[i] - ai[i];
439     idt  = 7*i;
440     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
441     s6 = x[5+idt];s7 = x[6+idt];
442     while (nz--) {
443       idx   = 7*(*vi--);
444       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
445       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
446       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
447       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
448       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
449       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
450       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
451       v -= 49;
452     }
453   }
454   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
455   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
456   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
457   PetscFunctionReturn(0);
458 }
459 
460 /*---------------------------------------------------------------------------------------------*/
461 #undef __FUNCT__
462 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
463 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
464 {
465   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
466   IS             iscol=a->col,isrow=a->row;
467   PetscErrorCode ierr;
468   const PetscInt *r,*c,*rout,*cout;
469   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
470   PetscInt       *diag = a->diag;
471   MatScalar      *aa=a->a,*v;
472   PetscScalar    s1,*x,*b,*t;
473 
474   PetscFunctionBegin;
475   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
476   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
477   t  = a->solve_work;
478 
479   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
480   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
481 
482   /* copy the b into temp work space according to permutation */
483   for (i=0; i<n; i++) {
484     t[i] = b[c[i]];
485   }
486 
487   /* forward solve the U^T */
488   for (i=0; i<n; i++) {
489 
490     v     = aa + diag[i];
491     /* multiply by the inverse of the block diagonal */
492     s1    = (*v++)*t[i];
493     vi    = aj + diag[i] + 1;
494     nz    = ai[i+1] - diag[i] - 1;
495     while (nz--) {
496       t[*vi++]  -= (*v++)*s1;
497     }
498     t[i]   = s1;
499   }
500   /* backward solve the L^T */
501   for (i=n-1; i>=0; i--){
502     v    = aa + diag[i] - 1;
503     vi   = aj + diag[i] - 1;
504     nz   = diag[i] - ai[i];
505     s1   = t[i];
506     while (nz--) {
507       t[*vi--]   -=  (*v--)*s1;
508     }
509   }
510 
511   /* copy t into x according to permutation */
512   for (i=0; i<n; i++) {
513     x[r[i]]   = t[i];
514   }
515 
516   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
517   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
518   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
519   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
520   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
521   PetscFunctionReturn(0);
522 }
523 
524 #undef __FUNCT__
525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527 {
528   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
529   IS             iscol=a->col,isrow=a->row;
530   PetscErrorCode ierr;
531   const PetscInt *r,*c,*rout,*cout;
532   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
533   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
534   MatScalar      *aa=a->a,*v;
535   PetscScalar    s1,s2,x1,x2;
536   PetscScalar    *x,*b,*t;
537 
538   PetscFunctionBegin;
539   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
540   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
541   t  = a->solve_work;
542 
543   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
544   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
545 
546   /* copy the b into temp work space according to permutation */
547   ii = 0;
548   for (i=0; i<n; i++) {
549     ic      = 2*c[i];
550     t[ii]   = b[ic];
551     t[ii+1] = b[ic+1];
552     ii += 2;
553   }
554 
555   /* forward solve the U^T */
556   idx = 0;
557   for (i=0; i<n; i++) {
558 
559     v     = aa + 4*diag[i];
560     /* multiply by the inverse of the block diagonal */
561     x1    = t[idx];   x2 = t[1+idx];
562     s1 = v[0]*x1  +  v[1]*x2;
563     s2 = v[2]*x1  +  v[3]*x2;
564     v += 4;
565 
566     vi    = aj + diag[i] + 1;
567     nz    = ai[i+1] - diag[i] - 1;
568     while (nz--) {
569       oidx = 2*(*vi++);
570       t[oidx]   -= v[0]*s1  +  v[1]*s2;
571       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
572       v  += 4;
573     }
574     t[idx]   = s1;t[1+idx] = s2;
575     idx += 2;
576   }
577   /* backward solve the L^T */
578   for (i=n-1; i>=0; i--){
579     v    = aa + 4*diag[i] - 4;
580     vi   = aj + diag[i] - 1;
581     nz   = diag[i] - ai[i];
582     idt  = 2*i;
583     s1 = t[idt];  s2 = t[1+idt];
584     while (nz--) {
585       idx   = 2*(*vi--);
586       t[idx]   -=  v[0]*s1 +  v[1]*s2;
587       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
588       v -= 4;
589     }
590   }
591 
592   /* copy t into x according to permutation */
593   ii = 0;
594   for (i=0; i<n; i++) {
595     ir      = 2*r[i];
596     x[ir]   = t[ii];
597     x[ir+1] = t[ii+1];
598     ii += 2;
599   }
600 
601   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
602   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
603   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
604   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
605   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
606   PetscFunctionReturn(0);
607 }
608 
609 #undef __FUNCT__
610 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
611 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
612 {
613   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
614   IS             iscol=a->col,isrow=a->row;
615   PetscErrorCode ierr;
616   const PetscInt *r,*c,*rout,*cout;
617   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
618   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
619   MatScalar      *aa=a->a,*v;
620   PetscScalar    s1,s2,s3,x1,x2,x3;
621   PetscScalar    *x,*b,*t;
622 
623   PetscFunctionBegin;
624   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
625   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
626   t  = a->solve_work;
627 
628   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
629   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
630 
631   /* copy the b into temp work space according to permutation */
632   ii = 0;
633   for (i=0; i<n; i++) {
634     ic      = 3*c[i];
635     t[ii]   = b[ic];
636     t[ii+1] = b[ic+1];
637     t[ii+2] = b[ic+2];
638     ii += 3;
639   }
640 
641   /* forward solve the U^T */
642   idx = 0;
643   for (i=0; i<n; i++) {
644 
645     v     = aa + 9*diag[i];
646     /* multiply by the inverse of the block diagonal */
647     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
648     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
649     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
650     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
651     v += 9;
652 
653     vi    = aj + diag[i] + 1;
654     nz    = ai[i+1] - diag[i] - 1;
655     while (nz--) {
656       oidx = 3*(*vi++);
657       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
658       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
659       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
660       v  += 9;
661     }
662     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
663     idx += 3;
664   }
665   /* backward solve the L^T */
666   for (i=n-1; i>=0; i--){
667     v    = aa + 9*diag[i] - 9;
668     vi   = aj + diag[i] - 1;
669     nz   = diag[i] - ai[i];
670     idt  = 3*i;
671     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
672     while (nz--) {
673       idx   = 3*(*vi--);
674       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
675       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
676       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
677       v -= 9;
678     }
679   }
680 
681   /* copy t into x according to permutation */
682   ii = 0;
683   for (i=0; i<n; i++) {
684     ir      = 3*r[i];
685     x[ir]   = t[ii];
686     x[ir+1] = t[ii+1];
687     x[ir+2] = t[ii+2];
688     ii += 3;
689   }
690 
691   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
692   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
693   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
694   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
695   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
696   PetscFunctionReturn(0);
697 }
698 
699 #undef __FUNCT__
700 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
701 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
702 {
703   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
704   IS             iscol=a->col,isrow=a->row;
705   PetscErrorCode ierr;
706   const PetscInt *r,*c,*rout,*cout;
707   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
708   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
709   MatScalar      *aa=a->a,*v;
710   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
711   PetscScalar    *x,*b,*t;
712 
713   PetscFunctionBegin;
714   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
715   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
716   t  = a->solve_work;
717 
718   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
719   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
720 
721   /* copy the b into temp work space according to permutation */
722   ii = 0;
723   for (i=0; i<n; i++) {
724     ic      = 4*c[i];
725     t[ii]   = b[ic];
726     t[ii+1] = b[ic+1];
727     t[ii+2] = b[ic+2];
728     t[ii+3] = b[ic+3];
729     ii += 4;
730   }
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 16*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
739     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
740     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
741     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
742     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
743     v += 16;
744 
745     vi    = aj + diag[i] + 1;
746     nz    = ai[i+1] - diag[i] - 1;
747     while (nz--) {
748       oidx = 4*(*vi++);
749       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
750       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
751       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
752       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
753       v  += 16;
754     }
755     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
756     idx += 4;
757   }
758   /* backward solve the L^T */
759   for (i=n-1; i>=0; i--){
760     v    = aa + 16*diag[i] - 16;
761     vi   = aj + diag[i] - 1;
762     nz   = diag[i] - ai[i];
763     idt  = 4*i;
764     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
765     while (nz--) {
766       idx   = 4*(*vi--);
767       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
768       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
769       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
770       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
771       v -= 16;
772     }
773   }
774 
775   /* copy t into x according to permutation */
776   ii = 0;
777   for (i=0; i<n; i++) {
778     ir      = 4*r[i];
779     x[ir]   = t[ii];
780     x[ir+1] = t[ii+1];
781     x[ir+2] = t[ii+2];
782     x[ir+3] = t[ii+3];
783     ii += 4;
784   }
785 
786   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
787   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
788   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
789   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
790   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
791   PetscFunctionReturn(0);
792 }
793 
794 #undef __FUNCT__
795 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
796 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
797 {
798   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
799   IS             iscol=a->col,isrow=a->row;
800   PetscErrorCode ierr;
801   const PetscInt *r,*c,*rout,*cout;
802   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
803   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
804   MatScalar      *aa=a->a,*v;
805   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
806   PetscScalar    *x,*b,*t;
807 
808   PetscFunctionBegin;
809   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
810   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
811   t  = a->solve_work;
812 
813   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
814   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
815 
816   /* copy the b into temp work space according to permutation */
817   ii = 0;
818   for (i=0; i<n; i++) {
819     ic      = 5*c[i];
820     t[ii]   = b[ic];
821     t[ii+1] = b[ic+1];
822     t[ii+2] = b[ic+2];
823     t[ii+3] = b[ic+3];
824     t[ii+4] = b[ic+4];
825     ii += 5;
826   }
827 
828   /* forward solve the U^T */
829   idx = 0;
830   for (i=0; i<n; i++) {
831 
832     v     = aa + 25*diag[i];
833     /* multiply by the inverse of the block diagonal */
834     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
835     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
836     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
837     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
838     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
839     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
840     v += 25;
841 
842     vi    = aj + diag[i] + 1;
843     nz    = ai[i+1] - diag[i] - 1;
844     while (nz--) {
845       oidx = 5*(*vi++);
846       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
847       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
848       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
849       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
850       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
851       v  += 25;
852     }
853     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
854     idx += 5;
855   }
856   /* backward solve the L^T */
857   for (i=n-1; i>=0; i--){
858     v    = aa + 25*diag[i] - 25;
859     vi   = aj + diag[i] - 1;
860     nz   = diag[i] - ai[i];
861     idt  = 5*i;
862     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
863     while (nz--) {
864       idx   = 5*(*vi--);
865       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
866       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
867       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
868       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
869       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
870       v -= 25;
871     }
872   }
873 
874   /* copy t into x according to permutation */
875   ii = 0;
876   for (i=0; i<n; i++) {
877     ir      = 5*r[i];
878     x[ir]   = t[ii];
879     x[ir+1] = t[ii+1];
880     x[ir+2] = t[ii+2];
881     x[ir+3] = t[ii+3];
882     x[ir+4] = t[ii+4];
883     ii += 5;
884   }
885 
886   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
887   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
888   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
889   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
890   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
891   PetscFunctionReturn(0);
892 }
893 
894 #undef __FUNCT__
895 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
896 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
897 {
898   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
899   IS             iscol=a->col,isrow=a->row;
900   PetscErrorCode ierr;
901   const PetscInt *r,*c,*rout,*cout;
902   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
903   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
904   MatScalar      *aa=a->a,*v;
905   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
906   PetscScalar    *x,*b,*t;
907 
908   PetscFunctionBegin;
909   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
910   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
911   t  = a->solve_work;
912 
913   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
914   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
915 
916   /* copy the b into temp work space according to permutation */
917   ii = 0;
918   for (i=0; i<n; i++) {
919     ic      = 6*c[i];
920     t[ii]   = b[ic];
921     t[ii+1] = b[ic+1];
922     t[ii+2] = b[ic+2];
923     t[ii+3] = b[ic+3];
924     t[ii+4] = b[ic+4];
925     t[ii+5] = b[ic+5];
926     ii += 6;
927   }
928 
929   /* forward solve the U^T */
930   idx = 0;
931   for (i=0; i<n; i++) {
932 
933     v     = aa + 36*diag[i];
934     /* multiply by the inverse of the block diagonal */
935     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
936     x6    = t[5+idx];
937     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
938     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
939     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
940     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
941     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
942     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
943     v += 36;
944 
945     vi    = aj + diag[i] + 1;
946     nz    = ai[i+1] - diag[i] - 1;
947     while (nz--) {
948       oidx = 6*(*vi++);
949       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
950       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
951       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
952       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
953       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
954       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
955       v  += 36;
956     }
957     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
958     t[5+idx] = s6;
959     idx += 6;
960   }
961   /* backward solve the L^T */
962   for (i=n-1; i>=0; i--){
963     v    = aa + 36*diag[i] - 36;
964     vi   = aj + diag[i] - 1;
965     nz   = diag[i] - ai[i];
966     idt  = 6*i;
967     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
968     s6 = t[5+idt];
969     while (nz--) {
970       idx   = 6*(*vi--);
971       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
972       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
973       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
974       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
975       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
976       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
977       v -= 36;
978     }
979   }
980 
981   /* copy t into x according to permutation */
982   ii = 0;
983   for (i=0; i<n; i++) {
984     ir      = 6*r[i];
985     x[ir]   = t[ii];
986     x[ir+1] = t[ii+1];
987     x[ir+2] = t[ii+2];
988     x[ir+3] = t[ii+3];
989     x[ir+4] = t[ii+4];
990     x[ir+5] = t[ii+5];
991     ii += 6;
992   }
993 
994   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
995   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
996   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
997   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
998   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
999   PetscFunctionReturn(0);
1000 }
1001 
1002 #undef __FUNCT__
1003 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1004 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1005 {
1006   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1007   IS             iscol=a->col,isrow=a->row;
1008   PetscErrorCode ierr;
1009   const PetscInt *r,*c,*rout,*cout;
1010   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1011   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1012   MatScalar      *aa=a->a,*v;
1013   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1014   PetscScalar    *x,*b,*t;
1015 
1016   PetscFunctionBegin;
1017   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1018   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1019   t  = a->solve_work;
1020 
1021   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1022   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1023 
1024   /* copy the b into temp work space according to permutation */
1025   ii = 0;
1026   for (i=0; i<n; i++) {
1027     ic      = 7*c[i];
1028     t[ii]   = b[ic];
1029     t[ii+1] = b[ic+1];
1030     t[ii+2] = b[ic+2];
1031     t[ii+3] = b[ic+3];
1032     t[ii+4] = b[ic+4];
1033     t[ii+5] = b[ic+5];
1034     t[ii+6] = b[ic+6];
1035     ii += 7;
1036   }
1037 
1038   /* forward solve the U^T */
1039   idx = 0;
1040   for (i=0; i<n; i++) {
1041 
1042     v     = aa + 49*diag[i];
1043     /* multiply by the inverse of the block diagonal */
1044     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1045     x6    = t[5+idx]; x7 = t[6+idx];
1046     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1047     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1048     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1049     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1050     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1051     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1052     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1053     v += 49;
1054 
1055     vi    = aj + diag[i] + 1;
1056     nz    = ai[i+1] - diag[i] - 1;
1057     while (nz--) {
1058       oidx = 7*(*vi++);
1059       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1060       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1061       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1062       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1063       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1064       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1065       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1066       v  += 49;
1067     }
1068     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1069     t[5+idx] = s6;t[6+idx] = s7;
1070     idx += 7;
1071   }
1072   /* backward solve the L^T */
1073   for (i=n-1; i>=0; i--){
1074     v    = aa + 49*diag[i] - 49;
1075     vi   = aj + diag[i] - 1;
1076     nz   = diag[i] - ai[i];
1077     idt  = 7*i;
1078     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1079     s6 = t[5+idt];s7 = t[6+idt];
1080     while (nz--) {
1081       idx   = 7*(*vi--);
1082       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1083       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1084       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1085       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1086       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1087       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1088       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1089       v -= 49;
1090     }
1091   }
1092 
1093   /* copy t into x according to permutation */
1094   ii = 0;
1095   for (i=0; i<n; i++) {
1096     ir      = 7*r[i];
1097     x[ir]   = t[ii];
1098     x[ir+1] = t[ii+1];
1099     x[ir+2] = t[ii+2];
1100     x[ir+3] = t[ii+3];
1101     x[ir+4] = t[ii+4];
1102     x[ir+5] = t[ii+5];
1103     x[ir+6] = t[ii+6];
1104     ii += 7;
1105   }
1106 
1107   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1108   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1109   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1111   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1112   PetscFunctionReturn(0);
1113 }
1114 
1115 /* ----------------------------------------------------------- */
1116 #undef __FUNCT__
1117 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1118 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1119 {
1120   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1121   IS             iscol=a->col,isrow=a->row;
1122   PetscErrorCode ierr;
1123   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1124   PetscInt       i,n=a->mbs;
1125   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1126   MatScalar      *aa=a->a,*v;
1127   PetscScalar    *x,*b,*s,*t,*ls;
1128 
1129   PetscFunctionBegin;
1130   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1131   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1132   t  = a->solve_work;
1133 
1134   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1135   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1136 
1137   /* forward solve the lower triangular */
1138   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1139   for (i=1; i<n; i++) {
1140     v   = aa + bs2*ai[i];
1141     vi  = aj + ai[i];
1142     nz  = a->diag[i] - ai[i];
1143     s = t + bs*i;
1144     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1145     while (nz--) {
1146       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1147       v += bs2;
1148     }
1149   }
1150   /* backward solve the upper triangular */
1151   ls = a->solve_work + A->cmap->n;
1152   for (i=n-1; i>=0; i--){
1153     v   = aa + bs2*(a->diag[i] + 1);
1154     vi  = aj + a->diag[i] + 1;
1155     nz  = ai[i+1] - a->diag[i] - 1;
1156     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1157     while (nz--) {
1158       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1159       v += bs2;
1160     }
1161     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1162     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1163   }
1164 
1165   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1166   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1168   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1169   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1170   PetscFunctionReturn(0);
1171 }
1172 
1173 #undef __FUNCT__
1174 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1175 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1176 {
1177   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1178   IS             iscol=a->col,isrow=a->row;
1179   PetscErrorCode ierr;
1180   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1181   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1182   MatScalar      *aa=a->a,*v;
1183   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1184   PetscScalar    *x,*b,*t;
1185 
1186   PetscFunctionBegin;
1187   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1188   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1189   t  = a->solve_work;
1190 
1191   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1192   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1193 
1194   /* forward solve the lower triangular */
1195   idx    = 7*(*r++);
1196   t[0] = b[idx];   t[1] = b[1+idx];
1197   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1198   t[5] = b[5+idx]; t[6] = b[6+idx];
1199 
1200   for (i=1; i<n; i++) {
1201     v     = aa + 49*ai[i];
1202     vi    = aj + ai[i];
1203     nz    = diag[i] - ai[i];
1204     idx   = 7*(*r++);
1205     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1206     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1207     while (nz--) {
1208       idx   = 7*(*vi++);
1209       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1210       x4    = t[3+idx];x5 = t[4+idx];
1211       x6    = t[5+idx];x7 = t[6+idx];
1212       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1213       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1214       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1215       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1216       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1217       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1218       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1219       v += 49;
1220     }
1221     idx = 7*i;
1222     t[idx]   = s1;t[1+idx] = s2;
1223     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1224     t[5+idx] = s6;t[6+idx] = s7;
1225   }
1226   /* backward solve the upper triangular */
1227   for (i=n-1; i>=0; i--){
1228     v    = aa + 49*diag[i] + 49;
1229     vi   = aj + diag[i] + 1;
1230     nz   = ai[i+1] - diag[i] - 1;
1231     idt  = 7*i;
1232     s1 = t[idt];  s2 = t[1+idt];
1233     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1234     s6 = t[5+idt];s7 = t[6+idt];
1235     while (nz--) {
1236       idx   = 7*(*vi++);
1237       x1    = t[idx];   x2 = t[1+idx];
1238       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1239       x6    = t[5+idx]; x7 = t[6+idx];
1240       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1241       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1242       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1243       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1244       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1245       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1246       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1247       v += 49;
1248     }
1249     idc = 7*(*c--);
1250     v   = aa + 49*diag[i];
1251     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1252                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1253     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1254                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1255     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1256                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1257     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1258                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1259     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1260                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1261     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1262                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1263     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1264                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1265   }
1266 
1267   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1268   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1269   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1270   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1271   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1272   PetscFunctionReturn(0);
1273 }
1274 
1275 #undef __FUNCT__
1276 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1277 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1278 {
1279   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1280   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1281   PetscErrorCode    ierr;
1282   PetscInt          *diag = a->diag,jdx;
1283   const MatScalar   *aa=a->a,*v;
1284   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1285   const PetscScalar *b;
1286 
1287   PetscFunctionBegin;
1288   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1289   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1290   /* forward solve the lower triangular */
1291   idx    = 0;
1292   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1293   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1294   x[6] = b[6+idx];
1295   for (i=1; i<n; i++) {
1296     v     =  aa + 49*ai[i];
1297     vi    =  aj + ai[i];
1298     nz    =  diag[i] - ai[i];
1299     idx   =  7*i;
1300     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1301     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1302     s7  =  b[6+idx];
1303     while (nz--) {
1304       jdx   = 7*(*vi++);
1305       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1306       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1307       x7    = x[6+jdx];
1308       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1309       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1310       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1311       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1312       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1313       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1314       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1315       v += 49;
1316      }
1317     x[idx]   = s1;
1318     x[1+idx] = s2;
1319     x[2+idx] = s3;
1320     x[3+idx] = s4;
1321     x[4+idx] = s5;
1322     x[5+idx] = s6;
1323     x[6+idx] = s7;
1324   }
1325   /* backward solve the upper triangular */
1326   for (i=n-1; i>=0; i--){
1327     v    = aa + 49*diag[i] + 49;
1328     vi   = aj + diag[i] + 1;
1329     nz   = ai[i+1] - diag[i] - 1;
1330     idt  = 7*i;
1331     s1 = x[idt];   s2 = x[1+idt];
1332     s3 = x[2+idt]; s4 = x[3+idt];
1333     s5 = x[4+idt]; s6 = x[5+idt];
1334     s7 = x[6+idt];
1335     while (nz--) {
1336       idx   = 7*(*vi++);
1337       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1338       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1339       x7    = x[6+idx];
1340       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1341       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1342       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1343       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1344       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1345       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1346       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1347       v += 49;
1348     }
1349     v        = aa + 49*diag[i];
1350     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1351                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1352     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1353                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1354     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1355                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1356     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1357                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1358     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1359                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1360     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1361                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1362     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1363                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1364   }
1365 
1366   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1367   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1368   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1369   PetscFunctionReturn(0);
1370 }
1371 
1372 #undef __FUNCT__
1373 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1374 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1375 {
1376   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1377   IS                iscol=a->col,isrow=a->row;
1378   PetscErrorCode    ierr;
1379   const PetscInt    *r,*c,*rout,*cout;
1380   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1381   const MatScalar   *aa=a->a,*v;
1382   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1383   const PetscScalar *b;
1384   PetscFunctionBegin;
1385   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1386   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1387   t  = a->solve_work;
1388 
1389   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1390   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1391 
1392   /* forward solve the lower triangular */
1393   idx    = 6*(*r++);
1394   t[0] = b[idx];   t[1] = b[1+idx];
1395   t[2] = b[2+idx]; t[3] = b[3+idx];
1396   t[4] = b[4+idx]; t[5] = b[5+idx];
1397   for (i=1; i<n; i++) {
1398     v     = aa + 36*ai[i];
1399     vi    = aj + ai[i];
1400     nz    = diag[i] - ai[i];
1401     idx   = 6*(*r++);
1402     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1403     s5  = b[4+idx]; s6 = b[5+idx];
1404     while (nz--) {
1405       idx   = 6*(*vi++);
1406       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1407       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1408       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1409       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1410       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1411       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1412       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1413       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1414       v += 36;
1415     }
1416     idx = 6*i;
1417     t[idx]   = s1;t[1+idx] = s2;
1418     t[2+idx] = s3;t[3+idx] = s4;
1419     t[4+idx] = s5;t[5+idx] = s6;
1420   }
1421   /* backward solve the upper triangular */
1422   for (i=n-1; i>=0; i--){
1423     v    = aa + 36*diag[i] + 36;
1424     vi   = aj + diag[i] + 1;
1425     nz   = ai[i+1] - diag[i] - 1;
1426     idt  = 6*i;
1427     s1 = t[idt];  s2 = t[1+idt];
1428     s3 = t[2+idt];s4 = t[3+idt];
1429     s5 = t[4+idt];s6 = t[5+idt];
1430     while (nz--) {
1431       idx   = 6*(*vi++);
1432       x1    = t[idx];   x2 = t[1+idx];
1433       x3    = t[2+idx]; x4 = t[3+idx];
1434       x5    = t[4+idx]; x6 = t[5+idx];
1435       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1436       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1437       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1438       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1439       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1440       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1441       v += 36;
1442     }
1443     idc = 6*(*c--);
1444     v   = aa + 36*diag[i];
1445     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1446                                  v[18]*s4+v[24]*s5+v[30]*s6;
1447     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1448                                  v[19]*s4+v[25]*s5+v[31]*s6;
1449     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1450                                  v[20]*s4+v[26]*s5+v[32]*s6;
1451     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1452                                  v[21]*s4+v[27]*s5+v[33]*s6;
1453     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1454                                  v[22]*s4+v[28]*s5+v[34]*s6;
1455     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1456                                  v[23]*s4+v[29]*s5+v[35]*s6;
1457   }
1458 
1459   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1460   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1461   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1462   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1463   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1464   PetscFunctionReturn(0);
1465 }
1466 
1467 #undef __FUNCT__
1468 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1469 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1470 {
1471   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1472   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1473   PetscErrorCode    ierr;
1474   PetscInt          *diag = a->diag,jdx;
1475   const MatScalar   *aa=a->a,*v;
1476   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1477   const PetscScalar *b;
1478 
1479   PetscFunctionBegin;
1480   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1481   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1482   /* forward solve the lower triangular */
1483   idx    = 0;
1484   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1485   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1486   for (i=1; i<n; i++) {
1487     v     =  aa + 36*ai[i];
1488     vi    =  aj + ai[i];
1489     nz    =  diag[i] - ai[i];
1490     idx   =  6*i;
1491     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1492     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1493     while (nz--) {
1494       jdx   = 6*(*vi++);
1495       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1496       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1497       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1503       v += 36;
1504      }
1505     x[idx]   = s1;
1506     x[1+idx] = s2;
1507     x[2+idx] = s3;
1508     x[3+idx] = s4;
1509     x[4+idx] = s5;
1510     x[5+idx] = s6;
1511   }
1512   /* backward solve the upper triangular */
1513   for (i=n-1; i>=0; i--){
1514     v    = aa + 36*diag[i] + 36;
1515     vi   = aj + diag[i] + 1;
1516     nz   = ai[i+1] - diag[i] - 1;
1517     idt  = 6*i;
1518     s1 = x[idt];   s2 = x[1+idt];
1519     s3 = x[2+idt]; s4 = x[3+idt];
1520     s5 = x[4+idt]; s6 = x[5+idt];
1521     while (nz--) {
1522       idx   = 6*(*vi++);
1523       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1524       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1525       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1526       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1527       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1528       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1529       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1530       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1531       v += 36;
1532     }
1533     v        = aa + 36*diag[i];
1534     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1535     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1536     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1537     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1538     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1539     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1540   }
1541 
1542   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1543   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1544   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 #undef __FUNCT__
1549 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1550 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1551 {
1552   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1553   IS                iscol=a->col,isrow=a->row;
1554   PetscErrorCode    ierr;
1555   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1556   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1557   const MatScalar   *aa=a->a,*v;
1558   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1559   const PetscScalar *b;
1560 
1561   PetscFunctionBegin;
1562   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1563   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1564   t  = a->solve_work;
1565 
1566   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1567   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1568 
1569   /* forward solve the lower triangular */
1570   idx    = 5*(*r++);
1571   t[0] = b[idx];   t[1] = b[1+idx];
1572   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1573   for (i=1; i<n; i++) {
1574     v     = aa + 25*ai[i];
1575     vi    = aj + ai[i];
1576     nz    = diag[i] - ai[i];
1577     idx   = 5*(*r++);
1578     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1579     s5  = b[4+idx];
1580     while (nz--) {
1581       idx   = 5*(*vi++);
1582       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1583       x4    = t[3+idx];x5 = t[4+idx];
1584       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1585       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1586       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1587       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1588       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1589       v += 25;
1590     }
1591     idx = 5*i;
1592     t[idx]   = s1;t[1+idx] = s2;
1593     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1594   }
1595   /* backward solve the upper triangular */
1596   for (i=n-1; i>=0; i--){
1597     v    = aa + 25*diag[i] + 25;
1598     vi   = aj + diag[i] + 1;
1599     nz   = ai[i+1] - diag[i] - 1;
1600     idt  = 5*i;
1601     s1 = t[idt];  s2 = t[1+idt];
1602     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1603     while (nz--) {
1604       idx   = 5*(*vi++);
1605       x1    = t[idx];   x2 = t[1+idx];
1606       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1607       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1608       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1609       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1610       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1611       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1612       v += 25;
1613     }
1614     idc = 5*(*c--);
1615     v   = aa + 25*diag[i];
1616     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1617                                  v[15]*s4+v[20]*s5;
1618     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1619                                  v[16]*s4+v[21]*s5;
1620     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1621                                  v[17]*s4+v[22]*s5;
1622     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1623                                  v[18]*s4+v[23]*s5;
1624     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1625                                  v[19]*s4+v[24]*s5;
1626   }
1627 
1628   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1629   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1630   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1631   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1632   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1637 {
1638   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1639   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1640   PetscErrorCode    ierr;
1641   PetscInt          jdx;
1642   const MatScalar   *aa=a->a,*v;
1643   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1644   const PetscScalar *b;
1645 
1646   PetscFunctionBegin;
1647   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1648   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1649   /* forward solve the lower triangular */
1650   idx    = 0;
1651   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1652   for (i=1; i<n; i++) {
1653     v   = aa + 25*ai[i];
1654     vi  = aj + ai[i];
1655     nz  = ai[i+1] - ai[i];
1656     idx = 5*i;
1657     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1658     while (nz--) {
1659       jdx   = 5*(*vi++);
1660       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1661       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1662       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1663       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1664       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1665       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1666       v    += 25;
1667     }
1668     x[idx]   = s1;
1669     x[1+idx] = s2;
1670     x[2+idx] = s3;
1671     x[3+idx] = s4;
1672     x[4+idx] = s5;
1673   }
1674 
1675   /* backward solve the upper triangular */
1676   for (i=n-1; i>=0; i--){
1677     v   = aa + 25*ai[2*n-i];
1678     vi  = aj + ai[2*n-i];
1679     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1680     idt = 5*i;
1681     s1 = x[idt];  s2 = x[1+idt];
1682     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1683     while (nz--) {
1684       idx   = 5*(*vi++);
1685       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1686       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1687       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1688       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1689       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1690       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1691       v    += 25;
1692     }
1693     /* x = inv_diagonal*x */
1694     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1695     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1696     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1697     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1698     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1699   }
1700 
1701   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1703   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1704   PetscFunctionReturn(0);
1705 }
1706 
1707 #undef __FUNCT__
1708 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1709 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1710 {
1711   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1712   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1713   PetscErrorCode    ierr;
1714   PetscInt          *diag = a->diag,jdx;
1715   const MatScalar   *aa=a->a,*v;
1716   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1717   const PetscScalar *b;
1718 
1719   PetscFunctionBegin;
1720   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1721   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1722   /* forward solve the lower triangular */
1723   idx    = 0;
1724   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1725   for (i=1; i<n; i++) {
1726     v     =  aa + 25*ai[i];
1727     vi    =  aj + ai[i];
1728     nz    =  diag[i] - ai[i];
1729     idx   =  5*i;
1730     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1731     while (nz--) {
1732       jdx   = 5*(*vi++);
1733       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1734       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1735       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1736       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1737       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1738       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1739       v    += 25;
1740     }
1741     x[idx]   = s1;
1742     x[1+idx] = s2;
1743     x[2+idx] = s3;
1744     x[3+idx] = s4;
1745     x[4+idx] = s5;
1746   }
1747   /* backward solve the upper triangular */
1748   for (i=n-1; i>=0; i--){
1749     v    = aa + 25*diag[i] + 25;
1750     vi   = aj + diag[i] + 1;
1751     nz   = ai[i+1] - diag[i] - 1;
1752     idt  = 5*i;
1753     s1 = x[idt];  s2 = x[1+idt];
1754     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1755     while (nz--) {
1756       idx   = 5*(*vi++);
1757       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1758       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1759       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1760       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1761       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1762       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1763       v    += 25;
1764     }
1765     v        = aa + 25*diag[i];
1766     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1767     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1768     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1769     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1770     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1771   }
1772 
1773   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1774   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1775   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1776   PetscFunctionReturn(0);
1777 }
1778 
1779 #undef __FUNCT__
1780 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1781 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1782 {
1783   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1784   IS                iscol=a->col,isrow=a->row;
1785   PetscErrorCode    ierr;
1786   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1787   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1788   const MatScalar   *aa=a->a,*v;
1789   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1790   const PetscScalar *b;
1791 
1792   PetscFunctionBegin;
1793   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1794   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1795   t  = a->solve_work;
1796 
1797   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1798   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1799 
1800   /* forward solve the lower triangular */
1801   idx    = 4*(*r++);
1802   t[0] = b[idx];   t[1] = b[1+idx];
1803   t[2] = b[2+idx]; t[3] = b[3+idx];
1804   for (i=1; i<n; i++) {
1805     v     = aa + 16*ai[i];
1806     vi    = aj + ai[i];
1807     nz    = diag[i] - ai[i];
1808     idx   = 4*(*r++);
1809     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1810     while (nz--) {
1811       idx   = 4*(*vi++);
1812       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1813       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1814       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1815       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1816       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1817       v    += 16;
1818     }
1819     idx        = 4*i;
1820     t[idx]   = s1;t[1+idx] = s2;
1821     t[2+idx] = s3;t[3+idx] = s4;
1822   }
1823   /* backward solve the upper triangular */
1824   for (i=n-1; i>=0; i--){
1825     v    = aa + 16*diag[i] + 16;
1826     vi   = aj + diag[i] + 1;
1827     nz   = ai[i+1] - diag[i] - 1;
1828     idt  = 4*i;
1829     s1 = t[idt];  s2 = t[1+idt];
1830     s3 = t[2+idt];s4 = t[3+idt];
1831     while (nz--) {
1832       idx   = 4*(*vi++);
1833       x1    = t[idx];   x2 = t[1+idx];
1834       x3    = t[2+idx]; x4 = t[3+idx];
1835       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1836       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1837       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1838       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1839       v += 16;
1840     }
1841     idc      = 4*(*c--);
1842     v        = aa + 16*diag[i];
1843     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1844     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1845     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1846     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1847   }
1848 
1849   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1850   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1851   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1852   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1853   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1854   PetscFunctionReturn(0);
1855 }
1856 
1857 #undef __FUNCT__
1858 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1859 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1860 {
1861   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1862   IS                iscol=a->col,isrow=a->row;
1863   PetscErrorCode    ierr;
1864   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1865   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1866   const MatScalar   *aa=a->a,*v;
1867   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1868   PetscScalar       *x;
1869   const PetscScalar *b;
1870 
1871   PetscFunctionBegin;
1872   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1873   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1874   t  = (MatScalar *)a->solve_work;
1875 
1876   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1877   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1878 
1879   /* forward solve the lower triangular */
1880   idx    = 4*(*r++);
1881   t[0] = (MatScalar)b[idx];
1882   t[1] = (MatScalar)b[1+idx];
1883   t[2] = (MatScalar)b[2+idx];
1884   t[3] = (MatScalar)b[3+idx];
1885   for (i=1; i<n; i++) {
1886     v     = aa + 16*ai[i];
1887     vi    = aj + ai[i];
1888     nz    = diag[i] - ai[i];
1889     idx   = 4*(*r++);
1890     s1 = (MatScalar)b[idx];
1891     s2 = (MatScalar)b[1+idx];
1892     s3 = (MatScalar)b[2+idx];
1893     s4 = (MatScalar)b[3+idx];
1894     while (nz--) {
1895       idx   = 4*(*vi++);
1896       x1  = t[idx];
1897       x2  = t[1+idx];
1898       x3  = t[2+idx];
1899       x4  = t[3+idx];
1900       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1901       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1902       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1903       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1904       v    += 16;
1905     }
1906     idx        = 4*i;
1907     t[idx]   = s1;
1908     t[1+idx] = s2;
1909     t[2+idx] = s3;
1910     t[3+idx] = s4;
1911   }
1912   /* backward solve the upper triangular */
1913   for (i=n-1; i>=0; i--){
1914     v    = aa + 16*diag[i] + 16;
1915     vi   = aj + diag[i] + 1;
1916     nz   = ai[i+1] - diag[i] - 1;
1917     idt  = 4*i;
1918     s1 = t[idt];
1919     s2 = t[1+idt];
1920     s3 = t[2+idt];
1921     s4 = t[3+idt];
1922     while (nz--) {
1923       idx   = 4*(*vi++);
1924       x1  = t[idx];
1925       x2  = t[1+idx];
1926       x3  = t[2+idx];
1927       x4  = t[3+idx];
1928       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1929       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1930       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1931       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1932       v += 16;
1933     }
1934     idc      = 4*(*c--);
1935     v        = aa + 16*diag[i];
1936     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1937     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1938     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1939     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1940     x[idc]   = (PetscScalar)t[idt];
1941     x[1+idc] = (PetscScalar)t[1+idt];
1942     x[2+idc] = (PetscScalar)t[2+idt];
1943     x[3+idc] = (PetscScalar)t[3+idt];
1944  }
1945 
1946   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1947   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1948   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1950   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1951   PetscFunctionReturn(0);
1952 }
1953 
1954 #if defined (PETSC_HAVE_SSE)
1955 
1956 #include PETSC_HAVE_SSE
1957 
1958 #undef __FUNCT__
1959 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1960 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1961 {
1962   /*
1963      Note: This code uses demotion of double
1964      to float when performing the mixed-mode computation.
1965      This may not be numerically reasonable for all applications.
1966   */
1967   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1968   IS             iscol=a->col,isrow=a->row;
1969   PetscErrorCode ierr;
1970   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
1971   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1972   MatScalar      *aa=a->a,*v;
1973   PetscScalar    *x,*b,*t;
1974 
1975   /* Make space in temp stack for 16 Byte Aligned arrays */
1976   float           ssealignedspace[11],*tmps,*tmpx;
1977   unsigned long   offset;
1978 
1979   PetscFunctionBegin;
1980   SSE_SCOPE_BEGIN;
1981 
1982     offset = (unsigned long)ssealignedspace % 16;
1983     if (offset) offset = (16 - offset)/4;
1984     tmps = &ssealignedspace[offset];
1985     tmpx = &ssealignedspace[offset+4];
1986     PREFETCH_NTA(aa+16*ai[1]);
1987 
1988     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1989     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1990     t  = a->solve_work;
1991 
1992     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1993     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1994 
1995     /* forward solve the lower triangular */
1996     idx  = 4*(*r++);
1997     t[0] = b[idx];   t[1] = b[1+idx];
1998     t[2] = b[2+idx]; t[3] = b[3+idx];
1999     v    =  aa + 16*ai[1];
2000 
2001     for (i=1; i<n;) {
2002       PREFETCH_NTA(&v[8]);
2003       vi   =  aj      + ai[i];
2004       nz   =  diag[i] - ai[i];
2005       idx  =  4*(*r++);
2006 
2007       /* Demote sum from double to float */
2008       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2009       LOAD_PS(tmps,XMM7);
2010 
2011       while (nz--) {
2012         PREFETCH_NTA(&v[16]);
2013         idx = 4*(*vi++);
2014 
2015         /* Demote solution (so far) from double to float */
2016         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2017 
2018         /* 4x4 Matrix-Vector product with negative accumulation: */
2019         SSE_INLINE_BEGIN_2(tmpx,v)
2020           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2021 
2022           /* First Column */
2023           SSE_COPY_PS(XMM0,XMM6)
2024           SSE_SHUFFLE(XMM0,XMM0,0x00)
2025           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2026           SSE_SUB_PS(XMM7,XMM0)
2027 
2028           /* Second Column */
2029           SSE_COPY_PS(XMM1,XMM6)
2030           SSE_SHUFFLE(XMM1,XMM1,0x55)
2031           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2032           SSE_SUB_PS(XMM7,XMM1)
2033 
2034           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2035 
2036           /* Third Column */
2037           SSE_COPY_PS(XMM2,XMM6)
2038           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2039           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2040           SSE_SUB_PS(XMM7,XMM2)
2041 
2042           /* Fourth Column */
2043           SSE_COPY_PS(XMM3,XMM6)
2044           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2045           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2046           SSE_SUB_PS(XMM7,XMM3)
2047         SSE_INLINE_END_2
2048 
2049         v  += 16;
2050       }
2051       idx = 4*i;
2052       v   = aa + 16*ai[++i];
2053       PREFETCH_NTA(v);
2054       STORE_PS(tmps,XMM7);
2055 
2056       /* Promote result from float to double */
2057       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
2058     }
2059     /* backward solve the upper triangular */
2060     idt  = 4*(n-1);
2061     ai16 = 16*diag[n-1];
2062     v    = aa + ai16 + 16;
2063     for (i=n-1; i>=0;){
2064       PREFETCH_NTA(&v[8]);
2065       vi = aj + diag[i] + 1;
2066       nz = ai[i+1] - diag[i] - 1;
2067 
2068       /* Demote accumulator from double to float */
2069       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2070       LOAD_PS(tmps,XMM7);
2071 
2072       while (nz--) {
2073         PREFETCH_NTA(&v[16]);
2074         idx = 4*(*vi++);
2075 
2076         /* Demote solution (so far) from double to float */
2077         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2078 
2079         /* 4x4 Matrix-Vector Product with negative accumulation: */
2080         SSE_INLINE_BEGIN_2(tmpx,v)
2081           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2082 
2083           /* First Column */
2084           SSE_COPY_PS(XMM0,XMM6)
2085           SSE_SHUFFLE(XMM0,XMM0,0x00)
2086           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2087           SSE_SUB_PS(XMM7,XMM0)
2088 
2089           /* Second Column */
2090           SSE_COPY_PS(XMM1,XMM6)
2091           SSE_SHUFFLE(XMM1,XMM1,0x55)
2092           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2093           SSE_SUB_PS(XMM7,XMM1)
2094 
2095           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2096 
2097           /* Third Column */
2098           SSE_COPY_PS(XMM2,XMM6)
2099           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2100           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2101           SSE_SUB_PS(XMM7,XMM2)
2102 
2103           /* Fourth Column */
2104           SSE_COPY_PS(XMM3,XMM6)
2105           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2106           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2107           SSE_SUB_PS(XMM7,XMM3)
2108         SSE_INLINE_END_2
2109         v  += 16;
2110       }
2111       v    = aa + ai16;
2112       ai16 = 16*diag[--i];
2113       PREFETCH_NTA(aa+ai16+16);
2114       /*
2115          Scale the result by the diagonal 4x4 block,
2116          which was inverted as part of the factorization
2117       */
2118       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2119         /* First Column */
2120         SSE_COPY_PS(XMM0,XMM7)
2121         SSE_SHUFFLE(XMM0,XMM0,0x00)
2122         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2123 
2124         /* Second Column */
2125         SSE_COPY_PS(XMM1,XMM7)
2126         SSE_SHUFFLE(XMM1,XMM1,0x55)
2127         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2128         SSE_ADD_PS(XMM0,XMM1)
2129 
2130         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2131 
2132         /* Third Column */
2133         SSE_COPY_PS(XMM2,XMM7)
2134         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2135         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2136         SSE_ADD_PS(XMM0,XMM2)
2137 
2138         /* Fourth Column */
2139         SSE_COPY_PS(XMM3,XMM7)
2140         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2141         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2142         SSE_ADD_PS(XMM0,XMM3)
2143 
2144         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2145       SSE_INLINE_END_3
2146 
2147       /* Promote solution from float to double */
2148       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2149 
2150       /* Apply reordering to t and stream into x.    */
2151       /* This way, x doesn't pollute the cache.      */
2152       /* Be careful with size: 2 doubles = 4 floats! */
2153       idc  = 4*(*c--);
2154       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2155         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2156         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2157         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2158         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2159         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2160         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2161       SSE_INLINE_END_2
2162       v    = aa + ai16 + 16;
2163       idt -= 4;
2164     }
2165 
2166     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2167     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2168     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2169     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2170     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2171   SSE_SCOPE_END;
2172   PetscFunctionReturn(0);
2173 }
2174 
2175 #endif
2176 
2177 
2178 /*
2179       Special case where the matrix was ILU(0) factored in the natural
2180    ordering. This eliminates the need for the column and row permutation.
2181 */
2182 #undef __FUNCT__
2183 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2184 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2185 {
2186   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2187   PetscInt          n=a->mbs;
2188   const PetscInt    *ai=a->i,*aj=a->j;
2189   PetscErrorCode    ierr;
2190   const PetscInt    *diag = a->diag;
2191   const MatScalar   *aa=a->a;
2192   PetscScalar       *x;
2193   const PetscScalar *b;
2194 
2195   PetscFunctionBegin;
2196   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2197   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2198 
2199 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2200   {
2201     static PetscScalar w[2000]; /* very BAD need to fix */
2202     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2203   }
2204 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2205   {
2206     static PetscScalar w[2000]; /* very BAD need to fix */
2207     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2208   }
2209 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2210   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2211 #else
2212   {
2213     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2214     const MatScalar *v;
2215     PetscInt        jdx,idt,idx,nz,i,ai16;
2216     const PetscInt  *vi;
2217 
2218   /* forward solve the lower triangular */
2219   idx    = 0;
2220   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2221   for (i=1; i<n; i++) {
2222     v     =  aa      + 16*ai[i];
2223     vi    =  aj      + ai[i];
2224     nz    =  diag[i] - ai[i];
2225     idx   +=  4;
2226     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2227     while (nz--) {
2228       jdx   = 4*(*vi++);
2229       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2230       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2231       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2232       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2234       v    += 16;
2235     }
2236     x[idx]   = s1;
2237     x[1+idx] = s2;
2238     x[2+idx] = s3;
2239     x[3+idx] = s4;
2240   }
2241   /* backward solve the upper triangular */
2242   idt = 4*(n-1);
2243   for (i=n-1; i>=0; i--){
2244     ai16 = 16*diag[i];
2245     v    = aa + ai16 + 16;
2246     vi   = aj + diag[i] + 1;
2247     nz   = ai[i+1] - diag[i] - 1;
2248     s1 = x[idt];  s2 = x[1+idt];
2249     s3 = x[2+idt];s4 = x[3+idt];
2250     while (nz--) {
2251       idx   = 4*(*vi++);
2252       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2253       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2254       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2255       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2256       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2257       v    += 16;
2258     }
2259     v        = aa + ai16;
2260     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2261     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2262     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2263     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2264     idt -= 4;
2265   }
2266   }
2267 #endif
2268 
2269   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2270   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2271   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2272   PetscFunctionReturn(0);
2273 }
2274 
2275 #undef __FUNCT__
2276 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2277 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2278 {
2279   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2280   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2281   PetscErrorCode ierr;
2282   PetscInt       *diag = a->diag;
2283   MatScalar      *aa=a->a;
2284   PetscScalar    *x,*b;
2285 
2286   PetscFunctionBegin;
2287   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2288   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2289 
2290   {
2291     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2292     MatScalar  *v,*t=(MatScalar *)x;
2293     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2294 
2295     /* forward solve the lower triangular */
2296     idx  = 0;
2297     t[0] = (MatScalar)b[0];
2298     t[1] = (MatScalar)b[1];
2299     t[2] = (MatScalar)b[2];
2300     t[3] = (MatScalar)b[3];
2301     for (i=1; i<n; i++) {
2302       v     =  aa      + 16*ai[i];
2303       vi    =  aj      + ai[i];
2304       nz    =  diag[i] - ai[i];
2305       idx   +=  4;
2306       s1 = (MatScalar)b[idx];
2307       s2 = (MatScalar)b[1+idx];
2308       s3 = (MatScalar)b[2+idx];
2309       s4 = (MatScalar)b[3+idx];
2310       while (nz--) {
2311         jdx = 4*(*vi++);
2312         x1  = t[jdx];
2313         x2  = t[1+jdx];
2314         x3  = t[2+jdx];
2315         x4  = t[3+jdx];
2316         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2317         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2318         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2319         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2320         v    += 16;
2321       }
2322       t[idx]   = s1;
2323       t[1+idx] = s2;
2324       t[2+idx] = s3;
2325       t[3+idx] = s4;
2326     }
2327     /* backward solve the upper triangular */
2328     idt = 4*(n-1);
2329     for (i=n-1; i>=0; i--){
2330       ai16 = 16*diag[i];
2331       v    = aa + ai16 + 16;
2332       vi   = aj + diag[i] + 1;
2333       nz   = ai[i+1] - diag[i] - 1;
2334       s1   = t[idt];
2335       s2   = t[1+idt];
2336       s3   = t[2+idt];
2337       s4   = t[3+idt];
2338       while (nz--) {
2339         idx = 4*(*vi++);
2340         x1  = (MatScalar)x[idx];
2341         x2  = (MatScalar)x[1+idx];
2342         x3  = (MatScalar)x[2+idx];
2343         x4  = (MatScalar)x[3+idx];
2344         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2345         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2346         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2347         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2348         v    += 16;
2349       }
2350       v        = aa + ai16;
2351       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2352       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2353       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2354       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2355       idt -= 4;
2356     }
2357   }
2358 
2359   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2362   PetscFunctionReturn(0);
2363 }
2364 
2365 #if defined (PETSC_HAVE_SSE)
2366 
2367 #include PETSC_HAVE_SSE
2368 #undef __FUNCT__
2369 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2370 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2371 {
2372   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2373   unsigned short *aj=(unsigned short *)a->j;
2374   PetscErrorCode ierr;
2375   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2376   MatScalar      *aa=a->a;
2377   PetscScalar    *x,*b;
2378 
2379   PetscFunctionBegin;
2380   SSE_SCOPE_BEGIN;
2381   /*
2382      Note: This code currently uses demotion of double
2383      to float when performing the mixed-mode computation.
2384      This may not be numerically reasonable for all applications.
2385   */
2386   PREFETCH_NTA(aa+16*ai[1]);
2387 
2388   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2389   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2390   {
2391     /* x will first be computed in single precision then promoted inplace to double */
2392     MatScalar      *v,*t=(MatScalar *)x;
2393     int            nz,i,idt,ai16;
2394     unsigned int   jdx,idx;
2395     unsigned short *vi;
2396     /* Forward solve the lower triangular factor. */
2397 
2398     /* First block is the identity. */
2399     idx  = 0;
2400     CONVERT_DOUBLE4_FLOAT4(t,b);
2401     v    =  aa + 16*((unsigned int)ai[1]);
2402 
2403     for (i=1; i<n;) {
2404       PREFETCH_NTA(&v[8]);
2405       vi   =  aj      + ai[i];
2406       nz   =  diag[i] - ai[i];
2407       idx +=  4;
2408 
2409       /* Demote RHS from double to float. */
2410       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2411       LOAD_PS(&t[idx],XMM7);
2412 
2413       while (nz--) {
2414         PREFETCH_NTA(&v[16]);
2415         jdx = 4*((unsigned int)(*vi++));
2416 
2417         /* 4x4 Matrix-Vector product with negative accumulation: */
2418         SSE_INLINE_BEGIN_2(&t[jdx],v)
2419           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2420 
2421           /* First Column */
2422           SSE_COPY_PS(XMM0,XMM6)
2423           SSE_SHUFFLE(XMM0,XMM0,0x00)
2424           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2425           SSE_SUB_PS(XMM7,XMM0)
2426 
2427           /* Second Column */
2428           SSE_COPY_PS(XMM1,XMM6)
2429           SSE_SHUFFLE(XMM1,XMM1,0x55)
2430           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2431           SSE_SUB_PS(XMM7,XMM1)
2432 
2433           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2434 
2435           /* Third Column */
2436           SSE_COPY_PS(XMM2,XMM6)
2437           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2438           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2439           SSE_SUB_PS(XMM7,XMM2)
2440 
2441           /* Fourth Column */
2442           SSE_COPY_PS(XMM3,XMM6)
2443           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2444           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2445           SSE_SUB_PS(XMM7,XMM3)
2446         SSE_INLINE_END_2
2447 
2448         v  += 16;
2449       }
2450       v    =  aa + 16*ai[++i];
2451       PREFETCH_NTA(v);
2452       STORE_PS(&t[idx],XMM7);
2453     }
2454 
2455     /* Backward solve the upper triangular factor.*/
2456 
2457     idt  = 4*(n-1);
2458     ai16 = 16*diag[n-1];
2459     v    = aa + ai16 + 16;
2460     for (i=n-1; i>=0;){
2461       PREFETCH_NTA(&v[8]);
2462       vi = aj + diag[i] + 1;
2463       nz = ai[i+1] - diag[i] - 1;
2464 
2465       LOAD_PS(&t[idt],XMM7);
2466 
2467       while (nz--) {
2468         PREFETCH_NTA(&v[16]);
2469         idx = 4*((unsigned int)(*vi++));
2470 
2471         /* 4x4 Matrix-Vector Product with negative accumulation: */
2472         SSE_INLINE_BEGIN_2(&t[idx],v)
2473           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2474 
2475           /* First Column */
2476           SSE_COPY_PS(XMM0,XMM6)
2477           SSE_SHUFFLE(XMM0,XMM0,0x00)
2478           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2479           SSE_SUB_PS(XMM7,XMM0)
2480 
2481           /* Second Column */
2482           SSE_COPY_PS(XMM1,XMM6)
2483           SSE_SHUFFLE(XMM1,XMM1,0x55)
2484           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2485           SSE_SUB_PS(XMM7,XMM1)
2486 
2487           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2488 
2489           /* Third Column */
2490           SSE_COPY_PS(XMM2,XMM6)
2491           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2492           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2493           SSE_SUB_PS(XMM7,XMM2)
2494 
2495           /* Fourth Column */
2496           SSE_COPY_PS(XMM3,XMM6)
2497           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2498           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2499           SSE_SUB_PS(XMM7,XMM3)
2500         SSE_INLINE_END_2
2501         v  += 16;
2502       }
2503       v    = aa + ai16;
2504       ai16 = 16*diag[--i];
2505       PREFETCH_NTA(aa+ai16+16);
2506       /*
2507          Scale the result by the diagonal 4x4 block,
2508          which was inverted as part of the factorization
2509       */
2510       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2511         /* First Column */
2512         SSE_COPY_PS(XMM0,XMM7)
2513         SSE_SHUFFLE(XMM0,XMM0,0x00)
2514         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2515 
2516         /* Second Column */
2517         SSE_COPY_PS(XMM1,XMM7)
2518         SSE_SHUFFLE(XMM1,XMM1,0x55)
2519         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2520         SSE_ADD_PS(XMM0,XMM1)
2521 
2522         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2523 
2524         /* Third Column */
2525         SSE_COPY_PS(XMM2,XMM7)
2526         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2527         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2528         SSE_ADD_PS(XMM0,XMM2)
2529 
2530         /* Fourth Column */
2531         SSE_COPY_PS(XMM3,XMM7)
2532         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2533         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2534         SSE_ADD_PS(XMM0,XMM3)
2535 
2536         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2537       SSE_INLINE_END_3
2538 
2539       v    = aa + ai16 + 16;
2540       idt -= 4;
2541     }
2542 
2543     /* Convert t from single precision back to double precision (inplace)*/
2544     idt = 4*(n-1);
2545     for (i=n-1;i>=0;i--) {
2546       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2547       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2548       PetscScalar *xtemp=&x[idt];
2549       MatScalar   *ttemp=&t[idt];
2550       xtemp[3] = (PetscScalar)ttemp[3];
2551       xtemp[2] = (PetscScalar)ttemp[2];
2552       xtemp[1] = (PetscScalar)ttemp[1];
2553       xtemp[0] = (PetscScalar)ttemp[0];
2554       idt -= 4;
2555     }
2556 
2557   } /* End of artificial scope. */
2558   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2559   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2560   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2561   SSE_SCOPE_END;
2562   PetscFunctionReturn(0);
2563 }
2564 
2565 #undef __FUNCT__
2566 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2567 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2568 {
2569   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2570   int            *aj=a->j;
2571   PetscErrorCode ierr;
2572   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2573   MatScalar      *aa=a->a;
2574   PetscScalar    *x,*b;
2575 
2576   PetscFunctionBegin;
2577   SSE_SCOPE_BEGIN;
2578   /*
2579      Note: This code currently uses demotion of double
2580      to float when performing the mixed-mode computation.
2581      This may not be numerically reasonable for all applications.
2582   */
2583   PREFETCH_NTA(aa+16*ai[1]);
2584 
2585   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2586   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2587   {
2588     /* x will first be computed in single precision then promoted inplace to double */
2589     MatScalar *v,*t=(MatScalar *)x;
2590     int       nz,i,idt,ai16;
2591     int       jdx,idx;
2592     int       *vi;
2593     /* Forward solve the lower triangular factor. */
2594 
2595     /* First block is the identity. */
2596     idx  = 0;
2597     CONVERT_DOUBLE4_FLOAT4(t,b);
2598     v    =  aa + 16*ai[1];
2599 
2600     for (i=1; i<n;) {
2601       PREFETCH_NTA(&v[8]);
2602       vi   =  aj      + ai[i];
2603       nz   =  diag[i] - ai[i];
2604       idx +=  4;
2605 
2606       /* Demote RHS from double to float. */
2607       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2608       LOAD_PS(&t[idx],XMM7);
2609 
2610       while (nz--) {
2611         PREFETCH_NTA(&v[16]);
2612         jdx = 4*(*vi++);
2613 /*          jdx = *vi++; */
2614 
2615         /* 4x4 Matrix-Vector product with negative accumulation: */
2616         SSE_INLINE_BEGIN_2(&t[jdx],v)
2617           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2618 
2619           /* First Column */
2620           SSE_COPY_PS(XMM0,XMM6)
2621           SSE_SHUFFLE(XMM0,XMM0,0x00)
2622           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2623           SSE_SUB_PS(XMM7,XMM0)
2624 
2625           /* Second Column */
2626           SSE_COPY_PS(XMM1,XMM6)
2627           SSE_SHUFFLE(XMM1,XMM1,0x55)
2628           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2629           SSE_SUB_PS(XMM7,XMM1)
2630 
2631           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2632 
2633           /* Third Column */
2634           SSE_COPY_PS(XMM2,XMM6)
2635           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2636           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2637           SSE_SUB_PS(XMM7,XMM2)
2638 
2639           /* Fourth Column */
2640           SSE_COPY_PS(XMM3,XMM6)
2641           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2642           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2643           SSE_SUB_PS(XMM7,XMM3)
2644         SSE_INLINE_END_2
2645 
2646         v  += 16;
2647       }
2648       v    =  aa + 16*ai[++i];
2649       PREFETCH_NTA(v);
2650       STORE_PS(&t[idx],XMM7);
2651     }
2652 
2653     /* Backward solve the upper triangular factor.*/
2654 
2655     idt  = 4*(n-1);
2656     ai16 = 16*diag[n-1];
2657     v    = aa + ai16 + 16;
2658     for (i=n-1; i>=0;){
2659       PREFETCH_NTA(&v[8]);
2660       vi = aj + diag[i] + 1;
2661       nz = ai[i+1] - diag[i] - 1;
2662 
2663       LOAD_PS(&t[idt],XMM7);
2664 
2665       while (nz--) {
2666         PREFETCH_NTA(&v[16]);
2667         idx = 4*(*vi++);
2668 /*          idx = *vi++; */
2669 
2670         /* 4x4 Matrix-Vector Product with negative accumulation: */
2671         SSE_INLINE_BEGIN_2(&t[idx],v)
2672           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2673 
2674           /* First Column */
2675           SSE_COPY_PS(XMM0,XMM6)
2676           SSE_SHUFFLE(XMM0,XMM0,0x00)
2677           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2678           SSE_SUB_PS(XMM7,XMM0)
2679 
2680           /* Second Column */
2681           SSE_COPY_PS(XMM1,XMM6)
2682           SSE_SHUFFLE(XMM1,XMM1,0x55)
2683           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2684           SSE_SUB_PS(XMM7,XMM1)
2685 
2686           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2687 
2688           /* Third Column */
2689           SSE_COPY_PS(XMM2,XMM6)
2690           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2691           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2692           SSE_SUB_PS(XMM7,XMM2)
2693 
2694           /* Fourth Column */
2695           SSE_COPY_PS(XMM3,XMM6)
2696           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2697           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2698           SSE_SUB_PS(XMM7,XMM3)
2699         SSE_INLINE_END_2
2700         v  += 16;
2701       }
2702       v    = aa + ai16;
2703       ai16 = 16*diag[--i];
2704       PREFETCH_NTA(aa+ai16+16);
2705       /*
2706          Scale the result by the diagonal 4x4 block,
2707          which was inverted as part of the factorization
2708       */
2709       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2710         /* First Column */
2711         SSE_COPY_PS(XMM0,XMM7)
2712         SSE_SHUFFLE(XMM0,XMM0,0x00)
2713         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2714 
2715         /* Second Column */
2716         SSE_COPY_PS(XMM1,XMM7)
2717         SSE_SHUFFLE(XMM1,XMM1,0x55)
2718         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2719         SSE_ADD_PS(XMM0,XMM1)
2720 
2721         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2722 
2723         /* Third Column */
2724         SSE_COPY_PS(XMM2,XMM7)
2725         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2726         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2727         SSE_ADD_PS(XMM0,XMM2)
2728 
2729         /* Fourth Column */
2730         SSE_COPY_PS(XMM3,XMM7)
2731         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2732         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2733         SSE_ADD_PS(XMM0,XMM3)
2734 
2735         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2736       SSE_INLINE_END_3
2737 
2738       v    = aa + ai16 + 16;
2739       idt -= 4;
2740     }
2741 
2742     /* Convert t from single precision back to double precision (inplace)*/
2743     idt = 4*(n-1);
2744     for (i=n-1;i>=0;i--) {
2745       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2746       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2747       PetscScalar *xtemp=&x[idt];
2748       MatScalar   *ttemp=&t[idt];
2749       xtemp[3] = (PetscScalar)ttemp[3];
2750       xtemp[2] = (PetscScalar)ttemp[2];
2751       xtemp[1] = (PetscScalar)ttemp[1];
2752       xtemp[0] = (PetscScalar)ttemp[0];
2753       idt -= 4;
2754     }
2755 
2756   } /* End of artificial scope. */
2757   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2758   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2759   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2760   SSE_SCOPE_END;
2761   PetscFunctionReturn(0);
2762 }
2763 
2764 #endif
2765 #undef __FUNCT__
2766 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2767 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2768 {
2769   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2770   IS                iscol=a->col,isrow=a->row;
2771   PetscErrorCode    ierr;
2772   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2773   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2774   const MatScalar   *aa=a->a,*v;
2775   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2776   const PetscScalar *b;
2777 
2778   PetscFunctionBegin;
2779   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2780   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2781   t  = a->solve_work;
2782 
2783   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2784   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2785 
2786   /* forward solve the lower triangular */
2787   idx    = 3*(*r++);
2788   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2789   for (i=1; i<n; i++) {
2790     v     = aa + 9*ai[i];
2791     vi    = aj + ai[i];
2792     nz    = diag[i] - ai[i];
2793     idx   = 3*(*r++);
2794     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2795     while (nz--) {
2796       idx   = 3*(*vi++);
2797       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2798       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2799       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2800       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2801       v += 9;
2802     }
2803     idx = 3*i;
2804     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2805   }
2806   /* backward solve the upper triangular */
2807   for (i=n-1; i>=0; i--){
2808     v    = aa + 9*diag[i] + 9;
2809     vi   = aj + diag[i] + 1;
2810     nz   = ai[i+1] - diag[i] - 1;
2811     idt  = 3*i;
2812     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2813     while (nz--) {
2814       idx   = 3*(*vi++);
2815       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2816       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2817       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2818       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2819       v += 9;
2820     }
2821     idc = 3*(*c--);
2822     v   = aa + 9*diag[i];
2823     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2824     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2825     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2826   }
2827   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2828   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2829   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2830   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2831   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
2832   PetscFunctionReturn(0);
2833 }
2834 
2835 /*
2836       Special case where the matrix was ILU(0) factored in the natural
2837    ordering. This eliminates the need for the column and row permutation.
2838 */
2839 #undef __FUNCT__
2840 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2841 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2842 {
2843   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2844   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2845   PetscErrorCode    ierr;
2846   PetscInt          *diag = a->diag;
2847   const MatScalar   *aa=a->a,*v;
2848   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2849   const PetscScalar *b;
2850   PetscInt          jdx,idt,idx,nz,*vi,i;
2851 
2852   PetscFunctionBegin;
2853   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2854   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2855 
2856   /* forward solve the lower triangular */
2857   idx    = 0;
2858   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2859   for (i=1; i<n; i++) {
2860     v     =  aa      + 9*ai[i];
2861     vi    =  aj      + ai[i];
2862     nz    =  diag[i] - ai[i];
2863     idx   +=  3;
2864     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2865     while (nz--) {
2866       jdx   = 3*(*vi++);
2867       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2868       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2869       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2870       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2871       v    += 9;
2872     }
2873     x[idx]   = s1;
2874     x[1+idx] = s2;
2875     x[2+idx] = s3;
2876   }
2877   /* backward solve the upper triangular */
2878   for (i=n-1; i>=0; i--){
2879     v    = aa + 9*diag[i] + 9;
2880     vi   = aj + diag[i] + 1;
2881     nz   = ai[i+1] - diag[i] - 1;
2882     idt  = 3*i;
2883     s1 = x[idt];  s2 = x[1+idt];
2884     s3 = x[2+idt];
2885     while (nz--) {
2886       idx   = 3*(*vi++);
2887       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2888       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2889       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2890       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2891       v    += 9;
2892     }
2893     v        = aa +  9*diag[i];
2894     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2895     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2896     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2897   }
2898 
2899   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2900   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2901   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
2902   PetscFunctionReturn(0);
2903 }
2904 
2905 #undef __FUNCT__
2906 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2907 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2908 {
2909   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2910   IS                iscol=a->col,isrow=a->row;
2911   PetscErrorCode    ierr;
2912   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2913   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2914   const MatScalar   *aa=a->a,*v;
2915   PetscScalar       *x,s1,s2,x1,x2,*t;
2916   const PetscScalar *b;
2917 
2918   PetscFunctionBegin;
2919   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2920   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2921   t  = a->solve_work;
2922 
2923   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2924   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2925 
2926   /* forward solve the lower triangular */
2927   idx    = 2*(*r++);
2928   t[0] = b[idx]; t[1] = b[1+idx];
2929   for (i=1; i<n; i++) {
2930     v     = aa + 4*ai[i];
2931     vi    = aj + ai[i];
2932     nz    = diag[i] - ai[i];
2933     idx   = 2*(*r++);
2934     s1  = b[idx]; s2 = b[1+idx];
2935     while (nz--) {
2936       idx   = 2*(*vi++);
2937       x1    = t[idx]; x2 = t[1+idx];
2938       s1 -= v[0]*x1 + v[2]*x2;
2939       s2 -= v[1]*x1 + v[3]*x2;
2940       v += 4;
2941     }
2942     idx = 2*i;
2943     t[idx] = s1; t[1+idx] = s2;
2944   }
2945   /* backward solve the upper triangular */
2946   for (i=n-1; i>=0; i--){
2947     v    = aa + 4*diag[i] + 4;
2948     vi   = aj + diag[i] + 1;
2949     nz   = ai[i+1] - diag[i] - 1;
2950     idt  = 2*i;
2951     s1 = t[idt]; s2 = t[1+idt];
2952     while (nz--) {
2953       idx   = 2*(*vi++);
2954       x1    = t[idx]; x2 = t[1+idx];
2955       s1 -= v[0]*x1 + v[2]*x2;
2956       s2 -= v[1]*x1 + v[3]*x2;
2957       v += 4;
2958     }
2959     idc = 2*(*c--);
2960     v   = aa + 4*diag[i];
2961     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2962     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2963   }
2964   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2965   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2966   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2967   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2968   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
2969   PetscFunctionReturn(0);
2970 }
2971 
2972 /*
2973       Special case where the matrix was ILU(0) factored in the natural
2974    ordering. This eliminates the need for the column and row permutation.
2975 */
2976 #undef __FUNCT__
2977 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2978 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2979 {
2980   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2981   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2982   PetscErrorCode    ierr;
2983   PetscInt          *diag = a->diag;
2984   const MatScalar   *aa=a->a,*v;
2985   PetscScalar       *x,s1,s2,x1,x2;
2986   const PetscScalar *b;
2987   PetscInt          jdx,idt,idx,nz,*vi,i;
2988 
2989   PetscFunctionBegin;
2990   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2991   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2992 
2993   /* forward solve the lower triangular */
2994   idx    = 0;
2995   x[0]   = b[0]; x[1] = b[1];
2996   for (i=1; i<n; i++) {
2997     v     =  aa      + 4*ai[i];
2998     vi    =  aj      + ai[i];
2999     nz    =  diag[i] - ai[i];
3000     idx   +=  2;
3001     s1  =  b[idx];s2 = b[1+idx];
3002     while (nz--) {
3003       jdx   = 2*(*vi++);
3004       x1    = x[jdx];x2 = x[1+jdx];
3005       s1 -= v[0]*x1 + v[2]*x2;
3006       s2 -= v[1]*x1 + v[3]*x2;
3007       v    += 4;
3008     }
3009     x[idx]   = s1;
3010     x[1+idx] = s2;
3011   }
3012   /* backward solve the upper triangular */
3013   for (i=n-1; i>=0; i--){
3014     v    = aa + 4*diag[i] + 4;
3015     vi   = aj + diag[i] + 1;
3016     nz   = ai[i+1] - diag[i] - 1;
3017     idt  = 2*i;
3018     s1 = x[idt];  s2 = x[1+idt];
3019     while (nz--) {
3020       idx   = 2*(*vi++);
3021       x1    = x[idx];   x2 = x[1+idx];
3022       s1 -= v[0]*x1 + v[2]*x2;
3023       s2 -= v[1]*x1 + v[3]*x2;
3024       v    += 4;
3025     }
3026     v        = aa +  4*diag[i];
3027     x[idt]   = v[0]*s1 + v[2]*s2;
3028     x[1+idt] = v[1]*s1 + v[3]*s2;
3029   }
3030 
3031   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3032   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3033   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3034   PetscFunctionReturn(0);
3035 }
3036 
3037 #undef __FUNCT__
3038 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3039 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
3040 {
3041   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
3042   IS             iscol=a->col,isrow=a->row;
3043   PetscErrorCode ierr;
3044   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3045   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3046   MatScalar      *aa=a->a,*v;
3047   PetscScalar    *x,*b,s1,*t;
3048 
3049   PetscFunctionBegin;
3050   if (!n) PetscFunctionReturn(0);
3051 
3052   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3053   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3054   t  = a->solve_work;
3055 
3056   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3057   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3058 
3059   /* forward solve the lower triangular */
3060   t[0] = b[*r++];
3061   for (i=1; i<n; i++) {
3062     v     = aa + ai[i];
3063     vi    = aj + ai[i];
3064     nz    = diag[i] - ai[i];
3065     s1  = b[*r++];
3066     while (nz--) {
3067       s1 -= (*v++)*t[*vi++];
3068     }
3069     t[i] = s1;
3070   }
3071   /* backward solve the upper triangular */
3072   for (i=n-1; i>=0; i--){
3073     v    = aa + diag[i] + 1;
3074     vi   = aj + diag[i] + 1;
3075     nz   = ai[i+1] - diag[i] - 1;
3076     s1 = t[i];
3077     while (nz--) {
3078       s1 -= (*v++)*t[*vi++];
3079     }
3080     x[*c--] = t[i] = aa[diag[i]]*s1;
3081   }
3082 
3083   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3084   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3085   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3086   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3087   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3088   PetscFunctionReturn(0);
3089 }
3090 /*
3091       Special case where the matrix was ILU(0) factored in the natural
3092    ordering. This eliminates the need for the column and row permutation.
3093 */
3094 #undef __FUNCT__
3095 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3096 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3097 {
3098   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3099   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3100   PetscErrorCode ierr;
3101   PetscInt       *diag = a->diag;
3102   MatScalar      *aa=a->a;
3103   PetscScalar    *x,*b;
3104   PetscScalar    s1,x1;
3105   MatScalar      *v;
3106   PetscInt       jdx,idt,idx,nz,*vi,i;
3107 
3108   PetscFunctionBegin;
3109   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3110   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3111 
3112   /* forward solve the lower triangular */
3113   idx    = 0;
3114   x[0]   = b[0];
3115   for (i=1; i<n; i++) {
3116     v     =  aa      + ai[i];
3117     vi    =  aj      + ai[i];
3118     nz    =  diag[i] - ai[i];
3119     idx   +=  1;
3120     s1  =  b[idx];
3121     while (nz--) {
3122       jdx   = *vi++;
3123       x1    = x[jdx];
3124       s1 -= v[0]*x1;
3125       v    += 1;
3126     }
3127     x[idx]   = s1;
3128   }
3129   /* backward solve the upper triangular */
3130   for (i=n-1; i>=0; i--){
3131     v    = aa + diag[i] + 1;
3132     vi   = aj + diag[i] + 1;
3133     nz   = ai[i+1] - diag[i] - 1;
3134     idt  = i;
3135     s1 = x[idt];
3136     while (nz--) {
3137       idx   = *vi++;
3138       x1    = x[idx];
3139       s1 -= v[0]*x1;
3140       v    += 1;
3141     }
3142     v        = aa +  diag[i];
3143     x[idt]   = v[0]*s1;
3144   }
3145   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3146   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3147   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3148   PetscFunctionReturn(0);
3149 }
3150 
3151 /* ----------------------------------------------------------------*/
3152 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3153 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3154 
3155 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
3156 #undef __FUNCT__
3157 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
3158 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
3159 {
3160   Mat            C=B;
3161   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
3162   IS             isrow = b->row,isicol = b->icol;
3163   PetscErrorCode ierr;
3164   const PetscInt *r,*ic,*ics;
3165   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
3166   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3167   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3168   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3169   MatScalar      *v_work;
3170 
3171   PetscFunctionBegin;
3172   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3173   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3174   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3175   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
3176   ics  = ic;
3177 
3178   /* generate work space needed by dense LU factorization */
3179   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3180   multiplier = v_work + bs;
3181   v_pivots   = (PetscInt*)(multiplier + bs2);
3182 
3183   for (i=0; i<n; i++){
3184     /* zero rtmp */
3185     /* L part */
3186     nz    = bi[i+1] - bi[i];
3187     bjtmp = bj + bi[i];
3188     for  (j=0; j<nz; j++){
3189       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3190     }
3191 
3192     /* U part */
3193     nz = bi[2*n-i+1] - bi[2*n-i];
3194     bjtmp = bj + bi[2*n-i];
3195     for  (j=0; j<nz; j++){
3196       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3197     }
3198 
3199     /* load in initial (unfactored row) */
3200     nz    = ai[r[i]+1] - ai[r[i]];
3201     ajtmp = aj + ai[r[i]];
3202     v     = aa + bs2*ai[r[i]];
3203     for (j=0; j<nz; j++) {
3204       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
3205     }
3206 
3207     /* elimination */
3208     bjtmp = bj + bi[i];
3209     row   = *bjtmp++;
3210     nzL   = bi[i+1] - bi[i];
3211     k   = 0;
3212     while  (k < nzL) {
3213       pc = rtmp + bs2*row;
3214       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3215       if (flg) {
3216         pv         = b->a + bs2*bdiag[row];
3217         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
3218         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3219         pv         = b->a + bs2*bi[2*n-row];
3220         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3221         for (j=0; j<nz; j++) {
3222           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3223         }
3224         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
3225       }
3226       row = *bjtmp++; k++;
3227     }
3228 
3229     /* finished row so stick it into b->a */
3230     /* L part */
3231     pv   = b->a + bs2*bi[i] ;
3232     pj   = b->j + bi[i] ;
3233     nz   = bi[i+1] - bi[i];
3234     for (j=0; j<nz; j++) {
3235       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3236     }
3237 
3238     /* Mark diagonal and invert diagonal for simplier triangular solves */
3239     pv  = b->a + bs2*bdiag[i];
3240     pj  = b->j + bdiag[i];
3241     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3242     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3243     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
3244 
3245     /* U part */
3246     pv = b->a + bs2*bi[2*n-i];
3247     pj = b->j + bi[2*n-i];
3248     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3249     for (j=0; j<nz; j++){
3250       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3251     }
3252   }
3253 
3254   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3255   ierr = PetscFree(v_work);CHKERRQ(ierr);
3256   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3257   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3258   if (bs == 5){
3259     C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3260   } else {
3261     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3262   }
3263   C->assembled = PETSC_TRUE;
3264   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
3265   PetscFunctionReturn(0);
3266 }
3267 
3268 /*
3269    ilu(0) with natural ordering under new data structure.
3270    Factored arrays bj and ba are stored as
3271      L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)
3272 
3273    bi=fact->i is an array of size 2n+2, in which
3274    bi+
3275      bi[i]      ->  1st entry of L(i,:),i=0,...,i-1
3276      bi[n]      ->  end of L(n-1,:)+1
3277      bi[n+1]    ->  1st entry of U(n-1,:)
3278      bi[2n-i]   ->  1st entry of U(i,:)
3279      bi[2n-i+1] ->  end of U(i,:)+1, the 1st entry of U(i-1,:)
3280      bi[2n]     ->  end of U(0,:)+1
3281 
3282    U(i,:) contains diag[i] as its last entry, i.e.,
3283     U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
3284 */
3285 #undef __FUNCT__
3286 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
3287 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3288 {
3289 
3290   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
3291   PetscErrorCode     ierr;
3292   PetscInt           mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
3293   PetscInt           i,j,nz=a->nz,*bi,*bj,*bdiag;
3294 
3295   PetscFunctionBegin;
3296   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3297   b     = (Mat_SeqBAIJ*)(fact)->data;
3298   bdiag = b->diag;
3299 
3300   /* replace matrix arrays with single allocations, then reset values */
3301   ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
3302 
3303   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr);
3304   ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr);
3305   ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr);
3306   b->singlemalloc = PETSC_FALSE;
3307   if (mbs > 0) {
3308     ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr);
3309   }
3310 
3311   /* set bi and bj with new data structure */
3312   bi = b->i;
3313   bj = b->j;
3314 
3315   /* L part */
3316   bi[0] = 0;
3317   for (i=0; i<mbs; i++){
3318     nz = adiag[i] - ai[i];
3319     bi[i+1] = bi[i] + nz;
3320     aj = a->j + ai[i];
3321     for (j=0; j<nz; j++){
3322       *bj = aj[j]; bj++;
3323     }
3324   }
3325 
3326   /* U part */
3327   bi[mbs+1] = bi[mbs];
3328   for (i=mbs-1; i>=0; i--){
3329     nz = ai[i+1] - adiag[i] - 1;
3330     if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz);
3331     bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1;
3332     aj = a->j + adiag[i] + 1;
3333     for (j=0; j<nz; j++){
3334       *bj = aj[j]; bj++;
3335     }
3336     /* diag[i] */
3337     *bj = i; bj++;
3338     bdiag[i] = bi[2*mbs-i+1]-1;
3339   }
3340   PetscFunctionReturn(0);
3341 }
3342 
3343 /*
3344      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3345    except that the data structure of Mat_SeqAIJ is slightly different.
3346    Not a good example of code reuse.
3347 */
3348 
3349 #undef __FUNCT__
3350 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3351 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3352 {
3353   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3354   IS             isicol;
3355   PetscErrorCode ierr;
3356   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
3357   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3358   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3359   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3360   PetscTruth     col_identity,row_identity,both_identity,flg;
3361   PetscReal      f;
3362 
3363   PetscFunctionBegin;
3364   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
3365   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
3366 
3367   f             = info->fill;
3368   levels        = (PetscInt)info->levels;
3369   diagonal_fill = (PetscInt)info->diagonal_fill;
3370   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3371   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3372   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3373   both_identity = (PetscTruth) (row_identity && col_identity);
3374 
3375   if (!levels && both_identity) {  /* special case copy the nonzero structure */
3376 
3377     PetscTruth newdatastruct=PETSC_FALSE;
3378     ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
3379     if (newdatastruct){
3380       ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
3381       (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
3382     } else {
3383       ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3384       ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3385     }
3386 
3387     fact->factor = MAT_FACTOR_ILU;
3388     b            = (Mat_SeqBAIJ*)(fact)->data;
3389     b->row       = isrow;
3390     b->col       = iscol;
3391     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3392     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3393     b->icol      = isicol;
3394     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3395     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3396     PetscFunctionReturn(0);
3397   }
3398 
3399   /* general case perform the symbolic factorization */
3400     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3401     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3402 
3403     /* get new row pointers */
3404     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
3405     ainew[0] = 0;
3406     /* don't know how many column pointers are needed so estimate */
3407     jmax = (PetscInt)(f*ai[n] + 1);
3408     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
3409     /* ajfill is level of fill for each fill entry */
3410     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
3411     /* fill is a linked list of nonzeros in active row */
3412     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
3413     /* im is level for each filled value */
3414     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
3415     /* dloc is location of diagonal in factor */
3416     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
3417     dloc[0]  = 0;
3418     for (prow=0; prow<n; prow++) {
3419 
3420       /* copy prow into linked list */
3421       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3422       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
3423       xi         = aj + ai[r[prow]];
3424       fill[n]    = n;
3425       fill[prow] = -1; /* marker for diagonal entry */
3426       while (nz--) {
3427 	fm  = n;
3428 	idx = ic[*xi++];
3429 	do {
3430 	  m  = fm;
3431 	  fm = fill[m];
3432 	} while (fm < idx);
3433 	fill[m]   = idx;
3434 	fill[idx] = fm;
3435 	im[idx]   = 0;
3436       }
3437 
3438       /* make sure diagonal entry is included */
3439       if (diagonal_fill && fill[prow] == -1) {
3440 	fm = n;
3441 	while (fill[fm] < prow) fm = fill[fm];
3442 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3443 	fill[fm]   = prow;
3444 	im[prow]   = 0;
3445 	nzf++;
3446 	dcount++;
3447       }
3448 
3449       nzi = 0;
3450       row = fill[n];
3451       while (row < prow) {
3452 	incrlev = im[row] + 1;
3453 	nz      = dloc[row];
3454 	xi      = ajnew  + ainew[row] + nz + 1;
3455 	flev    = ajfill + ainew[row] + nz + 1;
3456 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3457 	fm      = row;
3458 	while (nnz-- > 0) {
3459 	  idx = *xi++;
3460 	  if (*flev + incrlev > levels) {
3461 	    flev++;
3462 	    continue;
3463 	  }
3464 	  do {
3465 	    m  = fm;
3466 	    fm = fill[m];
3467 	  } while (fm < idx);
3468 	  if (fm != idx) {
3469 	    im[idx]   = *flev + incrlev;
3470 	    fill[m]   = idx;
3471 	    fill[idx] = fm;
3472 	    fm        = idx;
3473 	    nzf++;
3474 	  } else {
3475 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3476 	  }
3477 	  flev++;
3478 	}
3479 	row = fill[row];
3480 	nzi++;
3481       }
3482       /* copy new filled row into permanent storage */
3483       ainew[prow+1] = ainew[prow] + nzf;
3484       if (ainew[prow+1] > jmax) {
3485 
3486 	/* estimate how much additional space we will need */
3487 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3488 	/* just double the memory each time */
3489 	PetscInt maxadd = jmax;
3490 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3491 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3492 	jmax += maxadd;
3493 
3494 	/* allocate a longer ajnew and ajfill */
3495 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3496 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3497 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3498 	ajnew = xitmp;
3499 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3500 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3501 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3502 	ajfill = xitmp;
3503 	reallocate++; /* count how many reallocations are needed */
3504       }
3505       xitmp       = ajnew + ainew[prow];
3506       flev        = ajfill + ainew[prow];
3507       dloc[prow]  = nzi;
3508       fm          = fill[n];
3509       while (nzf--) {
3510 	*xitmp++ = fm;
3511 	*flev++ = im[fm];
3512 	fm      = fill[fm];
3513       }
3514       /* make sure row has diagonal entry */
3515       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3516 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3517     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3518       }
3519     }
3520     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3521     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3522     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3523     ierr = PetscFree(fill);CHKERRQ(ierr);
3524     ierr = PetscFree(im);CHKERRQ(ierr);
3525 
3526 #if defined(PETSC_USE_INFO)
3527     {
3528       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3529       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3530       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3531       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3532       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3533       if (diagonal_fill) {
3534 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3535       }
3536     }
3537 #endif
3538 
3539     /* put together the new matrix */
3540     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3541     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3542     b    = (Mat_SeqBAIJ*)(fact)->data;
3543     b->free_a       = PETSC_TRUE;
3544     b->free_ij      = PETSC_TRUE;
3545     b->singlemalloc = PETSC_FALSE;
3546     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3547     b->j          = ajnew;
3548     b->i          = ainew;
3549     for (i=0; i<n; i++) dloc[i] += ainew[i];
3550     b->diag       = dloc;
3551     b->ilen       = 0;
3552     b->imax       = 0;
3553     b->row        = isrow;
3554     b->col        = iscol;
3555     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3556     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3557     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3558     b->icol       = isicol;
3559     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3560     /* In b structure:  Free imax, ilen, old a, old j.
3561        Allocate dloc, solve_work, new a, new j */
3562     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
3563     b->maxnz          = b->nz = ainew[n];
3564 
3565     (fact)->info.factor_mallocs    = reallocate;
3566     (fact)->info.fill_ratio_given  = f;
3567     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3568 
3569   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3570   PetscFunctionReturn(0);
3571 }
3572 
3573 #undef __FUNCT__
3574 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3575 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3576 {
3577   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3578   /* int i,*AJ=a->j,nz=a->nz; */
3579   PetscFunctionBegin;
3580   /* Undo Column scaling */
3581 /*    while (nz--) { */
3582 /*      AJ[i] = AJ[i]/4; */
3583 /*    } */
3584   /* This should really invoke a push/pop logic, but we don't have that yet. */
3585   A->ops->setunfactored = PETSC_NULL;
3586   PetscFunctionReturn(0);
3587 }
3588 
3589 #undef __FUNCT__
3590 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3591 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3592 {
3593   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3594   PetscInt       *AJ=a->j,nz=a->nz;
3595   unsigned short *aj=(unsigned short *)AJ;
3596   PetscFunctionBegin;
3597   /* Is this really necessary? */
3598   while (nz--) {
3599     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3600   }
3601   A->ops->setunfactored = PETSC_NULL;
3602   PetscFunctionReturn(0);
3603 }
3604 
3605 
3606