xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 0fa040f91fc0ebe4f74b7f63f2221f0edf4da5a7)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "../src/mat/impls/baij/seq/baij.h"
8 #include "../src/mat/blockinvert.h"
9 #include "petscbt.h"
10 #include "../src/mat/utils/freespace.h"
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17   PetscErrorCode    ierr;
18   PetscInt          i,nz;
19   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
20   const MatScalar   *aa=a->a,*v;
21   PetscScalar       s1,*x;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode    ierr;
64   PetscInt          i,nz,idx,idt,oidx;
65   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
66   const MatScalar   *aa=a->a,*v;
67   PetscScalar       s1,s2,x1,x2,*x;
68   const PetscScalar *b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode    ierr;
123   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt          nz,idx,idt,j,i,oidx;
125   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
126   const MatScalar   *aa=a->a,*v;
127   PetscScalar       s1,s2,x1,x2,*x;
128   const PetscScalar *b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode    ierr;
182   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
183   PetscInt          i,nz,idx,idt,oidx;
184   const MatScalar   *aa=a->a,*v;
185   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
186   const PetscScalar *b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode    ierr;
244   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245   PetscInt          nz,idx,idt,j,i,oidx;
246   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
247   const MatScalar   *aa=a->a,*v;
248   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
249   const PetscScalar *b;
250 
251   PetscFunctionBegin;
252   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255 
256   /* forward solve the U^T */
257   idx = 0;
258   for (i=0; i<n; i++) {
259     v     = aa + bs2*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265     v -= bs2;
266 
267     vi    = aj + diag[i] - 1;
268     nz    = diag[i] - diag[i+1] - 1;
269     for(j=0;j>-nz;j--){
270       oidx = bs*vi[j];
271       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274       v  -= bs2;
275     }
276     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277     idx += bs;
278   }
279   /* backward solve the L^T */
280   for (i=n-1; i>=0; i--){
281     v    = aa + bs2*ai[i];
282     vi   = aj + ai[i];
283     nz   = ai[i+1] - ai[i];
284     idt  = bs*i;
285     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286     for(j=0;j<nz;j++){
287       idx   = bs*vi[j];
288       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291       v += bs2;
292     }
293   }
294   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode    ierr;
306   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
307   PetscInt          i,nz,idx,idt,oidx;
308   const MatScalar   *aa=a->a,*v;
309   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
310   const PetscScalar *b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 16*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328     v += 16;
329 
330     vi    = aj + diag[i] + 1;
331     nz    = ai[i+1] - diag[i] - 1;
332     while (nz--) {
333       oidx = 4*(*vi++);
334       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338       v  += 16;
339     }
340     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341     idx += 4;
342   }
343   /* backward solve the L^T */
344   for (i=n-1; i>=0; i--){
345     v    = aa + 16*diag[i] - 16;
346     vi   = aj + diag[i] - 1;
347     nz   = diag[i] - ai[i];
348     idt  = 4*i;
349     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350     while (nz--) {
351       idx   = 4*(*vi--);
352       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356       v -= 16;
357     }
358   }
359   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
370   PetscErrorCode    ierr;
371   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372   PetscInt          nz,idx,idt,j,i,oidx;
373   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
374   const MatScalar   *aa=a->a,*v;
375   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
376   const PetscScalar *b;
377 
378   PetscFunctionBegin;
379   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382 
383   /* forward solve the U^T */
384   idx = 0;
385   for (i=0; i<n; i++) {
386     v     = aa + bs2*diag[i];
387     /* multiply by the inverse of the block diagonal */
388     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393     v -= bs2;
394 
395     vi    = aj + diag[i] - 1;
396     nz    = diag[i] - diag[i+1] - 1;
397     for(j=0;j>-nz;j--){
398       oidx = bs*vi[j];
399       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403       v  -= bs2;
404     }
405     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406     idx += bs;
407   }
408   /* backward solve the L^T */
409   for (i=n-1; i>=0; i--){
410     v    = aa + bs2*ai[i];
411     vi   = aj + ai[i];
412     nz   = ai[i+1] - ai[i];
413     idt  = bs*i;
414     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415     for(j=0;j<nz;j++){
416       idx   = bs*vi[j];
417       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421       v += bs2;
422     }
423   }
424   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
435   PetscErrorCode    ierr;
436   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
437   PetscInt          i,nz,idx,idt,oidx;
438   const MatScalar   *aa=a->a,*v;
439   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
440   const PetscScalar *b;
441 
442   PetscFunctionBegin;
443   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446 
447   /* forward solve the U^T */
448   idx = 0;
449   for (i=0; i<n; i++) {
450 
451     v     = aa + 25*diag[i];
452     /* multiply by the inverse of the block diagonal */
453     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459     v += 25;
460 
461     vi    = aj + diag[i] + 1;
462     nz    = ai[i+1] - diag[i] - 1;
463     while (nz--) {
464       oidx = 5*(*vi++);
465       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470       v  += 25;
471     }
472     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473     idx += 5;
474   }
475   /* backward solve the L^T */
476   for (i=n-1; i>=0; i--){
477     v    = aa + 25*diag[i] - 25;
478     vi   = aj + diag[i] - 1;
479     nz   = diag[i] - ai[i];
480     idt  = 5*i;
481     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482     while (nz--) {
483       idx   = 5*(*vi--);
484       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489       v -= 25;
490     }
491   }
492   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
501 {
502   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503   PetscErrorCode ierr;
504   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505   PetscInt       nz,idx,idt,j,i,oidx;
506   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507   const MatScalar      *aa=a->a,*v;
508   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
509   const PetscScalar    *b;
510 
511   PetscFunctionBegin;
512   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515 
516   /* forward solve the U^T */
517   idx = 0;
518   for (i=0; i<n; i++) {
519     v     = aa + bs2*diag[i];
520     /* multiply by the inverse of the block diagonal */
521     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522     x5 = x[4+idx];
523     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528     v -= bs2;
529 
530     vi    = aj + diag[i] - 1;
531     nz    = diag[i] - diag[i+1] - 1;
532     for(j=0;j>-nz;j--){
533       oidx = bs*vi[j];
534       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539       v  -= bs2;
540     }
541     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542     idx += bs;
543   }
544   /* backward solve the L^T */
545   for (i=n-1; i>=0; i--){
546     v    = aa + bs2*ai[i];
547     vi   = aj + ai[i];
548     nz   = ai[i+1] - ai[i];
549     idt  = bs*i;
550     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551     for(j=0;j<nz;j++){
552       idx   = bs*vi[j];
553       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558       v += bs2;
559     }
560   }
561   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
562   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570 {
571   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
572   PetscErrorCode    ierr;
573   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
574   PetscInt          i,nz,idx,idt,oidx;
575   const MatScalar   *aa=a->a,*v;
576   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
577   const PetscScalar *b;
578 
579   PetscFunctionBegin;
580   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583 
584   /* forward solve the U^T */
585   idx = 0;
586   for (i=0; i<n; i++) {
587 
588     v     = aa + 36*diag[i];
589     /* multiply by the inverse of the block diagonal */
590     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591     x6    = x[5+idx];
592     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598     v += 36;
599 
600     vi    = aj + diag[i] + 1;
601     nz    = ai[i+1] - diag[i] - 1;
602     while (nz--) {
603       oidx = 6*(*vi++);
604       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610       v  += 36;
611     }
612     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613     x[5+idx] = s6;
614     idx += 6;
615   }
616   /* backward solve the L^T */
617   for (i=n-1; i>=0; i--){
618     v    = aa + 36*diag[i] - 36;
619     vi   = aj + diag[i] - 1;
620     nz   = diag[i] - ai[i];
621     idt  = 6*i;
622     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623     s6 = x[5+idt];
624     while (nz--) {
625       idx   = 6*(*vi--);
626       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v -= 36;
633     }
634   }
635   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #undef __FUNCT__
642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
644 {
645   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
646   PetscErrorCode    ierr;
647   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648   PetscInt          nz,idx,idt,j,i,oidx;
649   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
650   const MatScalar   *aa=a->a,*v;
651   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
652   const PetscScalar *b;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658 
659   /* forward solve the U^T */
660   idx = 0;
661   for (i=0; i<n; i++) {
662     v     = aa + bs2*diag[i];
663     /* multiply by the inverse of the block diagonal */
664     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665     x5 = x[4+idx]; x6 = x[5+idx];
666     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672     v -= bs2;
673 
674     vi    = aj + diag[i] - 1;
675     nz    = diag[i] - diag[i+1] - 1;
676     for(j=0;j>-nz;j--){
677       oidx = bs*vi[j];
678       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v  -= bs2;
685     }
686     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687     x[5+idx] = s6;
688     idx += bs;
689   }
690   /* backward solve the L^T */
691   for (i=n-1; i>=0; i--){
692     v    = aa + bs2*ai[i];
693     vi   = aj + ai[i];
694     nz   = ai[i+1] - ai[i];
695     idt  = bs*i;
696     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697     s6   = x[5+idt];
698     for(j=0;j<nz;j++){
699       idx   = bs*vi[j];
700       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706       v += bs2;
707     }
708   }
709   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718 {
719   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
720   PetscErrorCode    ierr;
721   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
722   PetscInt          i,nz,idx,idt,oidx;
723   const MatScalar   *aa=a->a,*v;
724   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
725   const PetscScalar *b;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 49*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739     x6    = x[5+idx]; x7 = x[6+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747     v += 49;
748 
749     vi    = aj + diag[i] + 1;
750     nz    = ai[i+1] - diag[i] - 1;
751     while (nz--) {
752       oidx = 7*(*vi++);
753       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760       v  += 49;
761     }
762     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763     x[5+idx] = s6;x[6+idx] = s7;
764     idx += 7;
765   }
766   /* backward solve the L^T */
767   for (i=n-1; i>=0; i--){
768     v    = aa + 49*diag[i] - 49;
769     vi   = aj + diag[i] - 1;
770     nz   = diag[i] - ai[i];
771     idt  = 7*i;
772     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773     s6 = x[5+idt];s7 = x[6+idt];
774     while (nz--) {
775       idx   = 7*(*vi--);
776       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783       v -= 49;
784     }
785   }
786   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
787   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
796   PetscErrorCode    ierr;
797   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798   PetscInt          nz,idx,idt,j,i,oidx;
799   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
800   const MatScalar   *aa=a->a,*v;
801   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
802   const PetscScalar *b;
803 
804   PetscFunctionBegin;
805   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808 
809   /* forward solve the U^T */
810   idx = 0;
811   for (i=0; i<n; i++) {
812     v     = aa + bs2*diag[i];
813     /* multiply by the inverse of the block diagonal */
814     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823     v -= bs2;
824     vi    = aj + diag[i] - 1;
825     nz    = diag[i] - diag[i+1] - 1;
826     for(j=0;j>-nz;j--){
827       oidx = bs*vi[j];
828       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v  -= bs2;
836     }
837     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838     x[5+idx] = s6;  x[6+idx] = s7;
839     idx += bs;
840   }
841   /* backward solve the L^T */
842   for (i=n-1; i>=0; i--){
843     v    = aa + bs2*ai[i];
844     vi   = aj + ai[i];
845     nz   = ai[i+1] - ai[i];
846     idt  = bs*i;
847     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848     s6   = x[5+idt];  s7 = x[6+idt];
849     for(j=0;j<nz;j++){
850       idx   = bs*vi[j];
851       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858       v += bs2;
859     }
860   }
861   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*---------------------------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
873   IS                iscol=a->col,isrow=a->row;
874   PetscErrorCode    ierr;
875   const PetscInt    *r,*c,*rout,*cout;
876   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
877   PetscInt          i,nz;
878   const MatScalar   *aa=a->a,*v;
879   PetscScalar       s1,*x,*t;
880   const PetscScalar *b;
881 
882   PetscFunctionBegin;
883   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
884   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
885   t  = a->solve_work;
886 
887   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
888   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
889 
890   /* copy the b into temp work space according to permutation */
891   for (i=0; i<n; i++) {
892     t[i] = b[c[i]];
893   }
894 
895   /* forward solve the U^T */
896   for (i=0; i<n; i++) {
897 
898     v     = aa + diag[i];
899     /* multiply by the inverse of the block diagonal */
900     s1    = (*v++)*t[i];
901     vi    = aj + diag[i] + 1;
902     nz    = ai[i+1] - diag[i] - 1;
903     while (nz--) {
904       t[*vi++]  -= (*v++)*s1;
905     }
906     t[i]   = s1;
907   }
908   /* backward solve the L^T */
909   for (i=n-1; i>=0; i--){
910     v    = aa + diag[i] - 1;
911     vi   = aj + diag[i] - 1;
912     nz   = diag[i] - ai[i];
913     s1   = t[i];
914     while (nz--) {
915       t[*vi--]   -=  (*v--)*s1;
916     }
917   }
918 
919   /* copy t into x according to permutation */
920   for (i=0; i<n; i++) {
921     x[r[i]]   = t[i];
922   }
923 
924   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
925   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
926   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
927   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
928   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
929   PetscFunctionReturn(0);
930 }
931 
932 #undef __FUNCT__
933 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
934 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
935 {
936   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
937   IS                iscol=a->col,isrow=a->row;
938   PetscErrorCode    ierr;
939   const PetscInt    *r,*c,*rout,*cout;
940   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
941   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
942   const MatScalar   *aa=a->a,*v;
943   PetscScalar       s1,s2,x1,x2,*x,*t;
944   const PetscScalar *b;
945 
946   PetscFunctionBegin;
947   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
948   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
949   t  = a->solve_work;
950 
951   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
952   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
953 
954   /* copy the b into temp work space according to permutation */
955   ii = 0;
956   for (i=0; i<n; i++) {
957     ic      = 2*c[i];
958     t[ii]   = b[ic];
959     t[ii+1] = b[ic+1];
960     ii += 2;
961   }
962 
963   /* forward solve the U^T */
964   idx = 0;
965   for (i=0; i<n; i++) {
966 
967     v     = aa + 4*diag[i];
968     /* multiply by the inverse of the block diagonal */
969     x1    = t[idx];   x2 = t[1+idx];
970     s1 = v[0]*x1  +  v[1]*x2;
971     s2 = v[2]*x1  +  v[3]*x2;
972     v += 4;
973 
974     vi    = aj + diag[i] + 1;
975     nz    = ai[i+1] - diag[i] - 1;
976     while (nz--) {
977       oidx = 2*(*vi++);
978       t[oidx]   -= v[0]*s1  +  v[1]*s2;
979       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
980       v  += 4;
981     }
982     t[idx]   = s1;t[1+idx] = s2;
983     idx += 2;
984   }
985   /* backward solve the L^T */
986   for (i=n-1; i>=0; i--){
987     v    = aa + 4*diag[i] - 4;
988     vi   = aj + diag[i] - 1;
989     nz   = diag[i] - ai[i];
990     idt  = 2*i;
991     s1 = t[idt];  s2 = t[1+idt];
992     while (nz--) {
993       idx   = 2*(*vi--);
994       t[idx]   -=  v[0]*s1 +  v[1]*s2;
995       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
996       v -= 4;
997     }
998   }
999 
1000   /* copy t into x according to permutation */
1001   ii = 0;
1002   for (i=0; i<n; i++) {
1003     ir      = 2*r[i];
1004     x[ir]   = t[ii];
1005     x[ir+1] = t[ii+1];
1006     ii += 2;
1007   }
1008 
1009   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1010   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1011   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1012   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1013   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNCT__
1018 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1019 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1020 {
1021   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1022   PetscErrorCode    ierr;
1023   IS                iscol=a->col,isrow=a->row;
1024   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1025   const PetscInt    *r,*c,*rout,*cout;
1026   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1027   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1028   const MatScalar   *aa=a->a,*v;
1029   PetscScalar       s1,s2,x1,x2,*x,*t;
1030   const PetscScalar *b;
1031 
1032   PetscFunctionBegin;
1033   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1034   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1035   t = a->solve_work;
1036 
1037   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1038   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1039 
1040   /* copy b into temp work space according to permutation */
1041   for(i=0;i<n;i++){
1042     ii = bs*i; ic = bs*c[i];
1043     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1044   }
1045 
1046   /* forward solve the U^T */
1047   idx = 0;
1048   for (i=0; i<n; i++) {
1049     v     = aa + bs2*diag[i];
1050     /* multiply by the inverse of the block diagonal */
1051     x1 = t[idx];   x2 = t[1+idx];
1052     s1 = v[0]*x1  +  v[1]*x2;
1053     s2 = v[2]*x1  +  v[3]*x2;
1054     v -= bs2;
1055 
1056     vi    = aj + diag[i] - 1;
1057     nz    = diag[i] - diag[i+1] - 1;
1058     for(j=0;j>-nz;j--){
1059       oidx = bs*vi[j];
1060       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1061       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1062       v  -= bs2;
1063     }
1064     t[idx]   = s1;t[1+idx] = s2;
1065     idx += bs;
1066   }
1067   /* backward solve the L^T */
1068   for (i=n-1; i>=0; i--){
1069     v    = aa + bs2*ai[i];
1070     vi   = aj + ai[i];
1071     nz   = ai[i+1] - ai[i];
1072     idt  = bs*i;
1073     s1   = t[idt];  s2 = t[1+idt];
1074     for(j=0;j<nz;j++){
1075       idx   = bs*vi[j];
1076       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1077       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1078       v += bs2;
1079     }
1080   }
1081 
1082   /* copy t into x according to permutation */
1083   for(i=0;i<n;i++){
1084     ii = bs*i;  ir = bs*r[i];
1085     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1086   }
1087 
1088   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1089   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1091   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1092   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 #undef __FUNCT__
1097 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1098 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1099 {
1100   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1101   IS                iscol=a->col,isrow=a->row;
1102   PetscErrorCode    ierr;
1103   const PetscInt    *r,*c,*rout,*cout;
1104   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1105   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1106   const MatScalar   *aa=a->a,*v;
1107   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1108   const PetscScalar *b;
1109 
1110   PetscFunctionBegin;
1111   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1112   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1113   t  = a->solve_work;
1114 
1115   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1116   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1117 
1118   /* copy the b into temp work space according to permutation */
1119   ii = 0;
1120   for (i=0; i<n; i++) {
1121     ic      = 3*c[i];
1122     t[ii]   = b[ic];
1123     t[ii+1] = b[ic+1];
1124     t[ii+2] = b[ic+2];
1125     ii += 3;
1126   }
1127 
1128   /* forward solve the U^T */
1129   idx = 0;
1130   for (i=0; i<n; i++) {
1131 
1132     v     = aa + 9*diag[i];
1133     /* multiply by the inverse of the block diagonal */
1134     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1135     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1136     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1137     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1138     v += 9;
1139 
1140     vi    = aj + diag[i] + 1;
1141     nz    = ai[i+1] - diag[i] - 1;
1142     while (nz--) {
1143       oidx = 3*(*vi++);
1144       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1145       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1146       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1147       v  += 9;
1148     }
1149     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1150     idx += 3;
1151   }
1152   /* backward solve the L^T */
1153   for (i=n-1; i>=0; i--){
1154     v    = aa + 9*diag[i] - 9;
1155     vi   = aj + diag[i] - 1;
1156     nz   = diag[i] - ai[i];
1157     idt  = 3*i;
1158     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1159     while (nz--) {
1160       idx   = 3*(*vi--);
1161       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1162       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1163       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1164       v -= 9;
1165     }
1166   }
1167 
1168   /* copy t into x according to permutation */
1169   ii = 0;
1170   for (i=0; i<n; i++) {
1171     ir      = 3*r[i];
1172     x[ir]   = t[ii];
1173     x[ir+1] = t[ii+1];
1174     x[ir+2] = t[ii+2];
1175     ii += 3;
1176   }
1177 
1178   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1179   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1180   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1181   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1182   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1183   PetscFunctionReturn(0);
1184 }
1185 
1186 #undef __FUNCT__
1187 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1188 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1189 {
1190   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1191   PetscErrorCode    ierr;
1192   IS                iscol=a->col,isrow=a->row;
1193   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1194   const PetscInt    *r,*c,*rout,*cout;
1195   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1196   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1197   const MatScalar   *aa=a->a,*v;
1198   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1199   const PetscScalar *b;
1200 
1201   PetscFunctionBegin;
1202   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204   t = a->solve_work;
1205 
1206   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1207   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1208 
1209   /* copy b into temp work space according to permutation */
1210   for(i=0;i<n;i++){
1211     ii = bs*i; ic = bs*c[i];
1212     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1213   }
1214 
1215   /* forward solve the U^T */
1216   idx = 0;
1217   for (i=0; i<n; i++) {
1218     v     = aa + bs2*diag[i];
1219     /* multiply by the inverse of the block diagonal */
1220     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1221     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1222     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1223     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1224     v -= bs2;
1225 
1226     vi    = aj + diag[i] - 1;
1227     nz    = diag[i] - diag[i+1] - 1;
1228     for(j=0;j>-nz;j--){
1229       oidx = bs*vi[j];
1230       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1231       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1232       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233       v  -= bs2;
1234     }
1235     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1236     idx += bs;
1237   }
1238   /* backward solve the L^T */
1239   for (i=n-1; i>=0; i--){
1240     v    = aa + bs2*ai[i];
1241     vi   = aj + ai[i];
1242     nz   = ai[i+1] - ai[i];
1243     idt  = bs*i;
1244     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1245     for(j=0;j<nz;j++){
1246       idx   = bs*vi[j];
1247       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1248       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1249       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1250       v += bs2;
1251     }
1252   }
1253 
1254   /* copy t into x according to permutation */
1255   for(i=0;i<n;i++){
1256     ii = bs*i;  ir = bs*r[i];
1257     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1258   }
1259 
1260   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1261   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1263   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1264   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1270 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1271 {
1272   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1273   IS                iscol=a->col,isrow=a->row;
1274   PetscErrorCode    ierr;
1275   const PetscInt    *r,*c,*rout,*cout;
1276   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1277   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1278   const MatScalar   *aa=a->a,*v;
1279   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1280   const PetscScalar *b;
1281 
1282   PetscFunctionBegin;
1283   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1284   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1285   t  = a->solve_work;
1286 
1287   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1288   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1289 
1290   /* copy the b into temp work space according to permutation */
1291   ii = 0;
1292   for (i=0; i<n; i++) {
1293     ic      = 4*c[i];
1294     t[ii]   = b[ic];
1295     t[ii+1] = b[ic+1];
1296     t[ii+2] = b[ic+2];
1297     t[ii+3] = b[ic+3];
1298     ii += 4;
1299   }
1300 
1301   /* forward solve the U^T */
1302   idx = 0;
1303   for (i=0; i<n; i++) {
1304 
1305     v     = aa + 16*diag[i];
1306     /* multiply by the inverse of the block diagonal */
1307     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1308     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1309     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1310     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1311     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1312     v += 16;
1313 
1314     vi    = aj + diag[i] + 1;
1315     nz    = ai[i+1] - diag[i] - 1;
1316     while (nz--) {
1317       oidx = 4*(*vi++);
1318       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1319       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1320       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1321       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1322       v  += 16;
1323     }
1324     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1325     idx += 4;
1326   }
1327   /* backward solve the L^T */
1328   for (i=n-1; i>=0; i--){
1329     v    = aa + 16*diag[i] - 16;
1330     vi   = aj + diag[i] - 1;
1331     nz   = diag[i] - ai[i];
1332     idt  = 4*i;
1333     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1334     while (nz--) {
1335       idx   = 4*(*vi--);
1336       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1337       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1338       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1339       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1340       v -= 16;
1341     }
1342   }
1343 
1344   /* copy t into x according to permutation */
1345   ii = 0;
1346   for (i=0; i<n; i++) {
1347     ir      = 4*r[i];
1348     x[ir]   = t[ii];
1349     x[ir+1] = t[ii+1];
1350     x[ir+2] = t[ii+2];
1351     x[ir+3] = t[ii+3];
1352     ii += 4;
1353   }
1354 
1355   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1356   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1358   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1359   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 #undef __FUNCT__
1364 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1365 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1366 {
1367   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1368   PetscErrorCode    ierr;
1369   IS                iscol=a->col,isrow=a->row;
1370   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1371   const PetscInt    *r,*c,*rout,*cout;
1372   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1373   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1374   const MatScalar   *aa=a->a,*v;
1375   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1376   const PetscScalar *b;
1377 
1378   PetscFunctionBegin;
1379   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1380   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1381   t = a->solve_work;
1382 
1383   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1384   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1385 
1386   /* copy b into temp work space according to permutation */
1387   for(i=0;i<n;i++){
1388     ii = bs*i; ic = bs*c[i];
1389     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1390   }
1391 
1392   /* forward solve the U^T */
1393   idx = 0;
1394   for (i=0; i<n; i++) {
1395     v     = aa + bs2*diag[i];
1396     /* multiply by the inverse of the block diagonal */
1397     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1398     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1399     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1400     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1401     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1402     v -= bs2;
1403 
1404     vi    = aj + diag[i] - 1;
1405     nz    = diag[i] - diag[i+1] - 1;
1406     for(j=0;j>-nz;j--){
1407       oidx = bs*vi[j];
1408       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1409       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1410       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1411       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1412       v  -= bs2;
1413     }
1414     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1415     idx += bs;
1416   }
1417   /* backward solve the L^T */
1418   for (i=n-1; i>=0; i--){
1419     v    = aa + bs2*ai[i];
1420     vi   = aj + ai[i];
1421     nz   = ai[i+1] - ai[i];
1422     idt  = bs*i;
1423     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1424     for(j=0;j<nz;j++){
1425       idx   = bs*vi[j];
1426       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1427       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1428       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1429       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1430       v += bs2;
1431     }
1432   }
1433 
1434   /* copy t into x according to permutation */
1435   for(i=0;i<n;i++){
1436     ii = bs*i;  ir = bs*r[i];
1437     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1438   }
1439 
1440   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1441   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1444   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 #undef __FUNCT__
1449 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1450 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1451 {
1452   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1453   IS                iscol=a->col,isrow=a->row;
1454   PetscErrorCode    ierr;
1455   const PetscInt    *r,*c,*rout,*cout;
1456   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1457   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1458   const MatScalar   *aa=a->a,*v;
1459   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1460   const PetscScalar *b;
1461 
1462   PetscFunctionBegin;
1463   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1464   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1465   t  = a->solve_work;
1466 
1467   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1468   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1469 
1470   /* copy the b into temp work space according to permutation */
1471   ii = 0;
1472   for (i=0; i<n; i++) {
1473     ic      = 5*c[i];
1474     t[ii]   = b[ic];
1475     t[ii+1] = b[ic+1];
1476     t[ii+2] = b[ic+2];
1477     t[ii+3] = b[ic+3];
1478     t[ii+4] = b[ic+4];
1479     ii += 5;
1480   }
1481 
1482   /* forward solve the U^T */
1483   idx = 0;
1484   for (i=0; i<n; i++) {
1485 
1486     v     = aa + 25*diag[i];
1487     /* multiply by the inverse of the block diagonal */
1488     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1489     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1490     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1491     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1492     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1493     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1494     v += 25;
1495 
1496     vi    = aj + diag[i] + 1;
1497     nz    = ai[i+1] - diag[i] - 1;
1498     while (nz--) {
1499       oidx = 5*(*vi++);
1500       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1501       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1502       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1503       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1504       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1505       v  += 25;
1506     }
1507     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1508     idx += 5;
1509   }
1510   /* backward solve the L^T */
1511   for (i=n-1; i>=0; i--){
1512     v    = aa + 25*diag[i] - 25;
1513     vi   = aj + diag[i] - 1;
1514     nz   = diag[i] - ai[i];
1515     idt  = 5*i;
1516     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1517     while (nz--) {
1518       idx   = 5*(*vi--);
1519       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1520       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1521       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1522       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1523       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1524       v -= 25;
1525     }
1526   }
1527 
1528   /* copy t into x according to permutation */
1529   ii = 0;
1530   for (i=0; i<n; i++) {
1531     ir      = 5*r[i];
1532     x[ir]   = t[ii];
1533     x[ir+1] = t[ii+1];
1534     x[ir+2] = t[ii+2];
1535     x[ir+3] = t[ii+3];
1536     x[ir+4] = t[ii+4];
1537     ii += 5;
1538   }
1539 
1540   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1541   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1542   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1543   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1544   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 #undef __FUNCT__
1549 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1550 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1551 {
1552   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1553   PetscErrorCode    ierr;
1554   IS                iscol=a->col,isrow=a->row;
1555   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1556   const PetscInt    *r,*c,*rout,*cout;
1557   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1558   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1559   const MatScalar   *aa=a->a,*v;
1560   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1561   const PetscScalar *b;
1562 
1563   PetscFunctionBegin;
1564   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1565   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1566   t = a->solve_work;
1567 
1568   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1569   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1570 
1571   /* copy b into temp work space according to permutation */
1572   for(i=0;i<n;i++){
1573     ii = bs*i; ic = bs*c[i];
1574     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1575     t[ii+4] = b[ic+4];
1576   }
1577 
1578   /* forward solve the U^T */
1579   idx = 0;
1580   for (i=0; i<n; i++) {
1581     v     = aa + bs2*diag[i];
1582     /* multiply by the inverse of the block diagonal */
1583     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1584     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1585     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1586     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1587     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1588     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1589     v -= bs2;
1590 
1591     vi    = aj + diag[i] - 1;
1592     nz    = diag[i] - diag[i+1] - 1;
1593     for(j=0;j>-nz;j--){
1594       oidx = bs*vi[j];
1595       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1596       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1597       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1598       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1599       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1600       v  -= bs2;
1601     }
1602     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1603     idx += bs;
1604   }
1605   /* backward solve the L^T */
1606   for (i=n-1; i>=0; i--){
1607     v    = aa + bs2*ai[i];
1608     vi   = aj + ai[i];
1609     nz   = ai[i+1] - ai[i];
1610     idt  = bs*i;
1611     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1612     for(j=0;j<nz;j++){
1613       idx   = bs*vi[j];
1614       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1615       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1616       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1617       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1618       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1619       v += bs2;
1620     }
1621   }
1622 
1623   /* copy t into x according to permutation */
1624   for(i=0;i<n;i++){
1625     ii = bs*i;  ir = bs*r[i];
1626     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1627     x[ir+4] = t[ii+4];
1628   }
1629 
1630   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1631   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1633   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1634   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 #undef __FUNCT__
1639 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1640 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1641 {
1642   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1643   IS                iscol=a->col,isrow=a->row;
1644   PetscErrorCode    ierr;
1645   const PetscInt    *r,*c,*rout,*cout;
1646   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1647   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1648   const MatScalar   *aa=a->a,*v;
1649   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1650   const PetscScalar *b;
1651 
1652   PetscFunctionBegin;
1653   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1654   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1655   t  = a->solve_work;
1656 
1657   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1658   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1659 
1660   /* copy the b into temp work space according to permutation */
1661   ii = 0;
1662   for (i=0; i<n; i++) {
1663     ic      = 6*c[i];
1664     t[ii]   = b[ic];
1665     t[ii+1] = b[ic+1];
1666     t[ii+2] = b[ic+2];
1667     t[ii+3] = b[ic+3];
1668     t[ii+4] = b[ic+4];
1669     t[ii+5] = b[ic+5];
1670     ii += 6;
1671   }
1672 
1673   /* forward solve the U^T */
1674   idx = 0;
1675   for (i=0; i<n; i++) {
1676 
1677     v     = aa + 36*diag[i];
1678     /* multiply by the inverse of the block diagonal */
1679     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1680     x6    = t[5+idx];
1681     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1682     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1683     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1684     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1685     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1686     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1687     v += 36;
1688 
1689     vi    = aj + diag[i] + 1;
1690     nz    = ai[i+1] - diag[i] - 1;
1691     while (nz--) {
1692       oidx = 6*(*vi++);
1693       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1694       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1695       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1696       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1697       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1698       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1699       v  += 36;
1700     }
1701     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1702     t[5+idx] = s6;
1703     idx += 6;
1704   }
1705   /* backward solve the L^T */
1706   for (i=n-1; i>=0; i--){
1707     v    = aa + 36*diag[i] - 36;
1708     vi   = aj + diag[i] - 1;
1709     nz   = diag[i] - ai[i];
1710     idt  = 6*i;
1711     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1712     s6 = t[5+idt];
1713     while (nz--) {
1714       idx   = 6*(*vi--);
1715       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1716       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1717       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1718       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1719       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1720       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1721       v -= 36;
1722     }
1723   }
1724 
1725   /* copy t into x according to permutation */
1726   ii = 0;
1727   for (i=0; i<n; i++) {
1728     ir      = 6*r[i];
1729     x[ir]   = t[ii];
1730     x[ir+1] = t[ii+1];
1731     x[ir+2] = t[ii+2];
1732     x[ir+3] = t[ii+3];
1733     x[ir+4] = t[ii+4];
1734     x[ir+5] = t[ii+5];
1735     ii += 6;
1736   }
1737 
1738   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1739   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1742   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1743   PetscFunctionReturn(0);
1744 }
1745 
1746 #undef __FUNCT__
1747 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1748 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1749 {
1750   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1751   PetscErrorCode    ierr;
1752   IS                iscol=a->col,isrow=a->row;
1753   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1754   const PetscInt    *r,*c,*rout,*cout;
1755   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1756   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1757   const MatScalar   *aa=a->a,*v;
1758   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759   const PetscScalar *b;
1760 
1761   PetscFunctionBegin;
1762   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1764   t = a->solve_work;
1765 
1766   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1767   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1768 
1769   /* copy b into temp work space according to permutation */
1770   for(i=0;i<n;i++){
1771     ii = bs*i; ic = bs*c[i];
1772     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1773     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1774   }
1775 
1776   /* forward solve the U^T */
1777   idx = 0;
1778   for (i=0; i<n; i++) {
1779     v     = aa + bs2*diag[i];
1780     /* multiply by the inverse of the block diagonal */
1781     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1782     x6    = t[5+idx];
1783     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1784     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1785     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1786     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1787     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1788     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1789     v -= bs2;
1790 
1791     vi    = aj + diag[i] - 1;
1792     nz    = diag[i] - diag[i+1] - 1;
1793     for(j=0;j>-nz;j--){
1794       oidx = bs*vi[j];
1795       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1796       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1797       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1798       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1799       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1800       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1801       v  -= bs2;
1802     }
1803     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1804     t[5+idx] = s6;
1805     idx += bs;
1806   }
1807   /* backward solve the L^T */
1808   for (i=n-1; i>=0; i--){
1809     v    = aa + bs2*ai[i];
1810     vi   = aj + ai[i];
1811     nz   = ai[i+1] - ai[i];
1812     idt  = bs*i;
1813     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1814     s6   = t[5+idt];
1815    for(j=0;j<nz;j++){
1816       idx   = bs*vi[j];
1817       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1818       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1819       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1820       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1821       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1822       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1823       v += bs2;
1824     }
1825   }
1826 
1827   /* copy t into x according to permutation */
1828   for(i=0;i<n;i++){
1829     ii = bs*i;  ir = bs*r[i];
1830     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1831     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1832   }
1833 
1834   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1835   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1836   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1837   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1838   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1839   PetscFunctionReturn(0);
1840 }
1841 
1842 #undef __FUNCT__
1843 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1844 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1845 {
1846   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1847   IS                iscol=a->col,isrow=a->row;
1848   PetscErrorCode    ierr;
1849   const PetscInt    *r,*c,*rout,*cout;
1850   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1851   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1852   const MatScalar   *aa=a->a,*v;
1853   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1854   const PetscScalar *b;
1855 
1856   PetscFunctionBegin;
1857   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1858   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1859   t  = a->solve_work;
1860 
1861   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1862   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1863 
1864   /* copy the b into temp work space according to permutation */
1865   ii = 0;
1866   for (i=0; i<n; i++) {
1867     ic      = 7*c[i];
1868     t[ii]   = b[ic];
1869     t[ii+1] = b[ic+1];
1870     t[ii+2] = b[ic+2];
1871     t[ii+3] = b[ic+3];
1872     t[ii+4] = b[ic+4];
1873     t[ii+5] = b[ic+5];
1874     t[ii+6] = b[ic+6];
1875     ii += 7;
1876   }
1877 
1878   /* forward solve the U^T */
1879   idx = 0;
1880   for (i=0; i<n; i++) {
1881 
1882     v     = aa + 49*diag[i];
1883     /* multiply by the inverse of the block diagonal */
1884     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1885     x6    = t[5+idx]; x7 = t[6+idx];
1886     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1887     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1888     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1889     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1890     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1891     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1892     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1893     v += 49;
1894 
1895     vi    = aj + diag[i] + 1;
1896     nz    = ai[i+1] - diag[i] - 1;
1897     while (nz--) {
1898       oidx = 7*(*vi++);
1899       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1900       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1901       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1902       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1903       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1904       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1905       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1906       v  += 49;
1907     }
1908     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1909     t[5+idx] = s6;t[6+idx] = s7;
1910     idx += 7;
1911   }
1912   /* backward solve the L^T */
1913   for (i=n-1; i>=0; i--){
1914     v    = aa + 49*diag[i] - 49;
1915     vi   = aj + diag[i] - 1;
1916     nz   = diag[i] - ai[i];
1917     idt  = 7*i;
1918     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1919     s6 = t[5+idt];s7 = t[6+idt];
1920     while (nz--) {
1921       idx   = 7*(*vi--);
1922       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1923       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1924       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1925       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1926       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1927       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1928       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1929       v -= 49;
1930     }
1931   }
1932 
1933   /* copy t into x according to permutation */
1934   ii = 0;
1935   for (i=0; i<n; i++) {
1936     ir      = 7*r[i];
1937     x[ir]   = t[ii];
1938     x[ir+1] = t[ii+1];
1939     x[ir+2] = t[ii+2];
1940     x[ir+3] = t[ii+3];
1941     x[ir+4] = t[ii+4];
1942     x[ir+5] = t[ii+5];
1943     x[ir+6] = t[ii+6];
1944     ii += 7;
1945   }
1946 
1947   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1948   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1950   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 #undef __FUNCT__
1955 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1956 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1957 {
1958   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1959   PetscErrorCode    ierr;
1960   IS                iscol=a->col,isrow=a->row;
1961   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1962   const PetscInt    *r,*c,*rout,*cout;
1963   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1964   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1965   const MatScalar   *aa=a->a,*v;
1966   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1967   const PetscScalar *b;
1968 
1969   PetscFunctionBegin;
1970   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1971   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1972   t = a->solve_work;
1973 
1974   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1975   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1976 
1977   /* copy b into temp work space according to permutation */
1978   for(i=0;i<n;i++){
1979     ii = bs*i; ic = bs*c[i];
1980     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1981     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1982   }
1983 
1984   /* forward solve the U^T */
1985   idx = 0;
1986   for (i=0; i<n; i++) {
1987     v     = aa + bs2*diag[i];
1988     /* multiply by the inverse of the block diagonal */
1989     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1990     x6    = t[5+idx]; x7 = t[6+idx];
1991     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1992     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1993     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1994     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1995     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1996     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1997     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1998     v -= bs2;
1999 
2000     vi    = aj + diag[i] - 1;
2001     nz    = diag[i] - diag[i+1] - 1;
2002     for(j=0;j>-nz;j--){
2003       oidx = bs*vi[j];
2004       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2005       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2006       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2007       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2008       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2009       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2010       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2011       v  -= bs2;
2012     }
2013     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2014     t[5+idx] = s6;  t[6+idx] = s7;
2015     idx += bs;
2016   }
2017   /* backward solve the L^T */
2018   for (i=n-1; i>=0; i--){
2019     v    = aa + bs2*ai[i];
2020     vi   = aj + ai[i];
2021     nz   = ai[i+1] - ai[i];
2022     idt  = bs*i;
2023     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2024     s6   = t[5+idt];  s7 = t[6+idt];
2025    for(j=0;j<nz;j++){
2026       idx   = bs*vi[j];
2027       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2028       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2029       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2030       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2031       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2032       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2033       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2034       v += bs2;
2035     }
2036   }
2037 
2038   /* copy t into x according to permutation */
2039   for(i=0;i<n;i++){
2040     ii = bs*i;  ir = bs*r[i];
2041     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2042     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2043   }
2044 
2045   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2046   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2047   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2048   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2049   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2050   PetscFunctionReturn(0);
2051 }
2052 
2053 /* ----------------------------------------------------------- */
2054 #undef __FUNCT__
2055 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2056 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2057 {
2058   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2059   IS                iscol=a->col,isrow=a->row;
2060   PetscErrorCode    ierr;
2061   const PetscInt    *r,*c,*rout,*cout;
2062   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2063   PetscInt          i,nz;
2064   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2065   const MatScalar   *aa=a->a,*v;
2066   PetscScalar       *x,*s,*t,*ls;
2067   const PetscScalar *b;
2068 
2069   PetscFunctionBegin;
2070   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2071   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2072   t  = a->solve_work;
2073 
2074   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2075   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2076 
2077   /* forward solve the lower triangular */
2078   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2079   for (i=1; i<n; i++) {
2080     v   = aa + bs2*ai[i];
2081     vi  = aj + ai[i];
2082     nz  = a->diag[i] - ai[i];
2083     s = t + bs*i;
2084     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2085     while (nz--) {
2086       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2087       v += bs2;
2088     }
2089   }
2090   /* backward solve the upper triangular */
2091   ls = a->solve_work + A->cmap->n;
2092   for (i=n-1; i>=0; i--){
2093     v   = aa + bs2*(a->diag[i] + 1);
2094     vi  = aj + a->diag[i] + 1;
2095     nz  = ai[i+1] - a->diag[i] - 1;
2096     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2097     while (nz--) {
2098       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2099       v += bs2;
2100     }
2101     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2102     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2103   }
2104 
2105   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2106   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2107   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2109   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2110   PetscFunctionReturn(0);
2111 }
2112 
2113 /* ----------------------------------------------------------- */
2114 #undef __FUNCT__
2115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2116 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2117 {
2118   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2119   IS                iscol=a->col,isrow=a->row;
2120   PetscErrorCode    ierr;
2121   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2122   PetscInt          i,nz,j;
2123   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2124   const MatScalar   *aa=a->a,*v;
2125   PetscScalar       *x,*t,*ls;
2126   const PetscScalar *b;
2127   PetscFunctionBegin;
2128   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2130   t    = a->solve_work;
2131 
2132   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2133   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2134 
2135   /* copy the b into temp work space according to permutation */
2136   for (i=0; i<n; i++) {
2137     for (j=0; j<bs; j++) {
2138       t[i*bs+j] = b[c[i]*bs+j];
2139     }
2140   }
2141 
2142 
2143   /* forward solve the upper triangular transpose */
2144   ls = a->solve_work + A->cmap->n;
2145   for (i=0; i<n; i++){
2146     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2147     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2148     v   = aa + bs2*(a->diag[i] + 1);
2149     vi  = aj + a->diag[i] + 1;
2150     nz  = ai[i+1] - a->diag[i] - 1;
2151     while (nz--) {
2152       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2153       v += bs2;
2154     }
2155   }
2156 
2157   /* backward solve the lower triangular transpose */
2158   for (i=n-1; i>=0; i--) {
2159     v   = aa + bs2*ai[i];
2160     vi  = aj + ai[i];
2161     nz  = a->diag[i] - ai[i];
2162     while (nz--) {
2163       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2164       v += bs2;
2165     }
2166   }
2167 
2168   /* copy t into x according to permutation */
2169   for (i=0; i<n; i++) {
2170     for (j=0; j<bs; j++) {
2171       x[bs*r[i]+j]   = t[bs*i+j];
2172     }
2173   }
2174 
2175   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2176   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2177   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2178   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2179   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2180   PetscFunctionReturn(0);
2181 }
2182 
2183 #undef __FUNCT__
2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2186 {
2187   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2188   IS                iscol=a->col,isrow=a->row;
2189   PetscErrorCode    ierr;
2190   const PetscInt    *r,*c,*rout,*cout;
2191   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2192   PetscInt          i,j,nz;
2193   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2194   const MatScalar   *aa=a->a,*v;
2195   PetscScalar       *x,*t,*ls;
2196   const PetscScalar *b;
2197 
2198   PetscFunctionBegin;
2199   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2200   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2201   t    = a->solve_work;
2202 
2203   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2204   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2205 
2206   /* copy the b into temp work space according to permutation */
2207   for (i=0; i<n; i++) {
2208     for (j=0; j<bs; j++) {
2209       t[i*bs+j] = b[c[i]*bs+j];
2210     }
2211   }
2212 
2213 
2214   /* forward solve the upper triangular transpose */
2215   ls = a->solve_work + A->cmap->n;
2216   for (i=0; i<n; i++){
2217     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2218     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2219     v   = aa + bs2*(diag[i] - 1);
2220     vi  = aj + diag[i] - 1;
2221     nz  = diag[i] - diag[i+1] - 1;
2222     for(j=0;j>-nz;j--){
2223       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2224       v -= bs2;
2225     }
2226   }
2227 
2228   /* backward solve the lower triangular transpose */
2229   for (i=n-1; i>=0; i--) {
2230     v   = aa + bs2*ai[i];
2231     vi  = aj + ai[i];
2232     nz  = ai[i+1] - ai[i];
2233     for(j=0;j<nz;j++){
2234       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2235       v += bs2;
2236     }
2237   }
2238 
2239   /* copy t into x according to permutation */
2240   for (i=0; i<n; i++) {
2241     for (j=0; j<bs; j++) {
2242       x[bs*r[i]+j]   = t[bs*i+j];
2243     }
2244   }
2245 
2246   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2247   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2248   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2249   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2250   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2251   PetscFunctionReturn(0);
2252 }
2253 
2254 /* bs = 15 for PFLOTRAN */
2255 
2256 #undef __FUNCT__
2257 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering"
2258 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering(Mat A,Vec bb,Vec xx)
2259 {
2260   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2261   PetscErrorCode    ierr;
2262   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2263   PetscInt          i,nz,idx,idt,m;
2264   const MatScalar   *aa=a->a,*v;
2265   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2266   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2267   PetscScalar       *x;
2268   const PetscScalar *b;
2269 
2270   PetscFunctionBegin;
2271   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2272   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2273 
2274   /* forward solve the lower triangular */
2275   idx    = 0;
2276   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2277   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2278   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2279 
2280   for (i=1; i<n; i++) {
2281     v     = aa + bs2*ai[i];
2282     vi    = aj + ai[i];
2283     nz    = ai[i+1] - ai[i];
2284     idt   = bs*i;
2285     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2286     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2287     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2288     for(m=0;m<nz;m++){
2289       idx   = bs*vi[m];
2290       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2291       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2292       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2293 
2294 
2295       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2296       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2297       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2298       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2299       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2300       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2301       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2302       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2303       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2304       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2305       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2306       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2307       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2308       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2309       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2310 
2311       v += bs2;
2312     }
2313     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2314     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2315     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2316 
2317   }
2318   /* backward solve the upper triangular */
2319   for (i=n-1; i>=0; i--){
2320     v    = aa + bs2*(adiag[i+1]+1);
2321     vi   = aj + adiag[i+1]+1;
2322     nz   = adiag[i] - adiag[i+1] - 1;
2323     idt  = bs*i;
2324     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2325     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2326     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2327 
2328     for(m=0;m<nz;m++){
2329       idx   = bs*vi[m];
2330       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2331       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2332       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2333 
2334       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2335       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2336       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2337       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2338       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2339       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2340       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2341       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2342       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2343       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2344       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2345       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2346       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2347       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2348       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2349 
2350       v += bs2;
2351     }
2352 
2353     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2354     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2355     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2356     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2357     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2358     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2359     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2360     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2361     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2362     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2363     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2364     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2365     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2366     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2367     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2368 
2369   }
2370 
2371   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2372   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2373   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2374   PetscFunctionReturn(0);
2375 }
2376 
2377 #undef __FUNCT__
2378 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2379 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2380 {
2381   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2382   PetscErrorCode    ierr;
2383   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2384   PetscInt          i,k,nz,kdx,idx,idt,m;
2385   const MatScalar   *aa=a->a,*v;
2386   PetscScalar       s[15];
2387   PetscScalar       *x;
2388   const PetscScalar *b;
2389 
2390   PetscFunctionBegin;
2391   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2392   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2393 
2394   /* forward solve the lower triangular */
2395   idx    = 0;
2396   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2397   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2398   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2399 
2400   for (i=1; i<n; i++) {
2401     v     = aa + bs2*ai[i];
2402     vi    = aj + ai[i];
2403     nz    = ai[i+1] - ai[i];
2404     idt   = bs*i;
2405     s[0]   = b[idt];    s[1]  = b[1+idt];  s[2]  = b[2+idt];  s[3]  = b[3+idt];  s[4]  = b[4+idt];
2406     s[5]   = b[5+idt];  s[6]  = b[6+idt];  s[7]  = b[7+idt];  s[8]  = b[8+idt];  s[9] = b[9+idt];
2407     s[10]  = b[10+idt]; s[11] = b[11+idt]; s[12] = b[12+idt]; s[13] = b[13+idt]; s[14] = b[14+idt];
2408     for(m=0;m<nz;m++){
2409       idx   = bs*vi[m];
2410 
2411       for(k=0;k<15;k++){
2412 	kdx = k + idx;
2413 	s[0]  -= v[0]*x[kdx];
2414 	s[1]  -= v[1]*x[kdx];
2415 	s[2]  -= v[2]*x[kdx];
2416         s[3]  -= v[3]*x[kdx];
2417 	s[4]  -= v[4]*x[kdx];
2418 	s[5]  -= v[5]*x[kdx];
2419 	s[6]  -= v[6]*x[kdx];
2420         s[7]  -= v[7]*x[kdx];
2421 	s[8]  -= v[8]*x[kdx];
2422 	s[9]  -= v[9]*x[kdx];
2423 	s[10] -= v[10]*x[kdx];
2424         s[11] -= v[11]*x[kdx];
2425 	s[12] -= v[12]*x[kdx];
2426 	s[13] -= v[13]*x[kdx];
2427 	s[14] -= v[14]*x[kdx];
2428 	v += 15;
2429       }
2430     }
2431     x[idt]    = s[0];  x[1+idt]  = s[1];  x[2+idt]  = s[2];  x[3+idt]  = s[3];  x[4+idt]  = s[4];
2432     x[5+idt]  = s[5];  x[6+idt]  = s[6];  x[7+idt]  = s[7];  x[8+idt]  = s[8];  x[9+idt]  = s[9];
2433     x[10+idt] = s[10]; x[11+idt] = s[11]; x[12+idt] = s[12]; x[13+idt] = s[13]; x[14+idt] = s[14];
2434 
2435   }
2436   /* backward solve the upper triangular */
2437   for (i=n-1; i>=0; i--){
2438     v    = aa + bs2*(adiag[i+1]+1);
2439     vi   = aj + adiag[i+1]+1;
2440     nz   = adiag[i] - adiag[i+1] - 1;
2441     idt  = bs*i;
2442     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2443     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2444     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2445 
2446     for(m=0;m<nz;m++){
2447       idx   = bs*vi[m];
2448       for(k=0;k<15;k++){
2449 	kdx = k + idx;
2450 	s[0]  -= v[0]*x[kdx];
2451 	s[1]  -= v[1]*x[kdx];
2452 	s[2]  -= v[2]*x[kdx];
2453         s[3]  -= v[3]*x[kdx];
2454 	s[4]  -= v[4]*x[kdx];
2455 	s[5]  -= v[5]*x[kdx];
2456 	s[6]  -= v[6]*x[kdx];
2457         s[7]  -= v[7]*x[kdx];
2458 	s[8]  -= v[8]*x[kdx];
2459 	s[9]  -= v[9]*x[kdx];
2460 	s[10] -= v[10]*x[kdx];
2461         s[11] -= v[11]*x[kdx];
2462 	s[12] -= v[12]*x[kdx];
2463 	s[13] -= v[13]*x[kdx];
2464 	s[14] -= v[14]*x[kdx];
2465 	v += 15;
2466       }
2467     }
2468     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
2469     for(k=0;k<15;k++){
2470       x[idt]    += v[0]*s[k];
2471       x[1+idt]  += v[1]*s[k];
2472       x[2+idt]  += v[2]*s[k];
2473       x[3+idt]  += v[3]*s[k];
2474       x[4+idt]  += v[4]*s[k];
2475       x[5+idt]  += v[5]*s[k];
2476       x[6+idt]  += v[6]*s[k];
2477       x[7+idt]  += v[7]*s[k];
2478       x[8+idt]  += v[8]*s[k];
2479       x[9+idt]  += v[9]*s[k];
2480       x[10+idt] += v[10]*s[k];
2481       x[11+idt] += v[11]*s[k];
2482       x[12+idt] += v[12]*s[k];
2483       x[13+idt] += v[13]*s[k];
2484       x[14+idt] += v[14]*s[k];
2485       v += 15;
2486     }
2487   }
2488   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2489   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2490   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2491   PetscFunctionReturn(0);
2492 }
2493 
2494 
2495 #undef __FUNCT__
2496 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2497 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2498 {
2499   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2500   IS                iscol=a->col,isrow=a->row;
2501   PetscErrorCode    ierr;
2502   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2503   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2504   PetscInt          i,nz,idx,idt,idc;
2505   const MatScalar   *aa=a->a,*v;
2506   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2507   const PetscScalar *b;
2508 
2509   PetscFunctionBegin;
2510   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2511   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2512   t  = a->solve_work;
2513 
2514   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2515   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2516 
2517   /* forward solve the lower triangular */
2518   idx    = 7*(*r++);
2519   t[0] = b[idx];   t[1] = b[1+idx];
2520   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2521   t[5] = b[5+idx]; t[6] = b[6+idx];
2522 
2523   for (i=1; i<n; i++) {
2524     v     = aa + 49*ai[i];
2525     vi    = aj + ai[i];
2526     nz    = diag[i] - ai[i];
2527     idx   = 7*(*r++);
2528     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2529     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2530     while (nz--) {
2531       idx   = 7*(*vi++);
2532       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2533       x4    = t[3+idx];x5 = t[4+idx];
2534       x6    = t[5+idx];x7 = t[6+idx];
2535       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2536       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2537       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2538       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2539       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2540       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2541       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2542       v += 49;
2543     }
2544     idx = 7*i;
2545     t[idx]   = s1;t[1+idx] = s2;
2546     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2547     t[5+idx] = s6;t[6+idx] = s7;
2548   }
2549   /* backward solve the upper triangular */
2550   for (i=n-1; i>=0; i--){
2551     v    = aa + 49*diag[i] + 49;
2552     vi   = aj + diag[i] + 1;
2553     nz   = ai[i+1] - diag[i] - 1;
2554     idt  = 7*i;
2555     s1 = t[idt];  s2 = t[1+idt];
2556     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2557     s6 = t[5+idt];s7 = t[6+idt];
2558     while (nz--) {
2559       idx   = 7*(*vi++);
2560       x1    = t[idx];   x2 = t[1+idx];
2561       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2562       x6    = t[5+idx]; x7 = t[6+idx];
2563       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2564       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2565       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2566       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2567       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2568       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2569       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2570       v += 49;
2571     }
2572     idc = 7*(*c--);
2573     v   = aa + 49*diag[i];
2574     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2575                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2576     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2577                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2578     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2579                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2580     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2581                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2582     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2583                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2584     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2585                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2586     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2587                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2588   }
2589 
2590   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2591   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2592   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2593   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2594   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2595   PetscFunctionReturn(0);
2596 }
2597 
2598 #undef __FUNCT__
2599 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2600 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2601 {
2602   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2603   IS                iscol=a->col,isrow=a->row;
2604   PetscErrorCode    ierr;
2605   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2606   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2607   PetscInt          i,nz,idx,idt,idc,m;
2608   const MatScalar   *aa=a->a,*v;
2609   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2610   const PetscScalar *b;
2611 
2612   PetscFunctionBegin;
2613   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2614   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2615   t  = a->solve_work;
2616 
2617   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2618   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2619 
2620   /* forward solve the lower triangular */
2621   idx    = 7*r[0];
2622   t[0] = b[idx];   t[1] = b[1+idx];
2623   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2624   t[5] = b[5+idx]; t[6] = b[6+idx];
2625 
2626   for (i=1; i<n; i++) {
2627     v     = aa + 49*ai[i];
2628     vi    = aj + ai[i];
2629     nz    = ai[i+1] - ai[i];
2630     idx   = 7*r[i];
2631     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2632     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2633     for(m=0;m<nz;m++){
2634       idx   = 7*vi[m];
2635       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2636       x4    = t[3+idx];x5 = t[4+idx];
2637       x6    = t[5+idx];x7 = t[6+idx];
2638       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2639       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2640       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2641       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2642       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2643       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2644       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2645       v += 49;
2646     }
2647     idx = 7*i;
2648     t[idx]   = s1;t[1+idx] = s2;
2649     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2650     t[5+idx] = s6;t[6+idx] = s7;
2651   }
2652   /* backward solve the upper triangular */
2653   for (i=n-1; i>=0; i--){
2654     v    = aa + 49*(adiag[i+1]+1);
2655     vi   = aj + adiag[i+1]+1;
2656     nz   = adiag[i] - adiag[i+1] - 1;
2657     idt  = 7*i;
2658     s1 = t[idt];  s2 = t[1+idt];
2659     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2660     s6 = t[5+idt];s7 = t[6+idt];
2661     for(m=0;m<nz;m++){
2662       idx   = 7*vi[m];
2663       x1    = t[idx];   x2 = t[1+idx];
2664       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2665       x6    = t[5+idx]; x7 = t[6+idx];
2666       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2667       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2668       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2669       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2670       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2671       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2672       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2673       v += 49;
2674     }
2675     idc = 7*c[i];
2676     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2677                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2678     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2679                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2680     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2681                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2682     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2683                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2684     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2685                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2686     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2687                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2688     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2689                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2690   }
2691 
2692   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2693   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2694   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2695   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2696   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2697   PetscFunctionReturn(0);
2698 }
2699 
2700 #undef __FUNCT__
2701 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2702 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2703 {
2704   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2705   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2706   PetscErrorCode    ierr;
2707   PetscInt          i,nz,idx,idt,jdx;
2708   const MatScalar   *aa=a->a,*v;
2709   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2710   const PetscScalar *b;
2711 
2712   PetscFunctionBegin;
2713   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2714   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2715   /* forward solve the lower triangular */
2716   idx    = 0;
2717   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2718   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2719   x[6] = b[6+idx];
2720   for (i=1; i<n; i++) {
2721     v     =  aa + 49*ai[i];
2722     vi    =  aj + ai[i];
2723     nz    =  diag[i] - ai[i];
2724     idx   =  7*i;
2725     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2726     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2727     s7  =  b[6+idx];
2728     while (nz--) {
2729       jdx   = 7*(*vi++);
2730       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2731       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2732       x7    = x[6+jdx];
2733       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2734       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2735       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2736       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2737       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2738       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2739       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2740       v += 49;
2741      }
2742     x[idx]   = s1;
2743     x[1+idx] = s2;
2744     x[2+idx] = s3;
2745     x[3+idx] = s4;
2746     x[4+idx] = s5;
2747     x[5+idx] = s6;
2748     x[6+idx] = s7;
2749   }
2750   /* backward solve the upper triangular */
2751   for (i=n-1; i>=0; i--){
2752     v    = aa + 49*diag[i] + 49;
2753     vi   = aj + diag[i] + 1;
2754     nz   = ai[i+1] - diag[i] - 1;
2755     idt  = 7*i;
2756     s1 = x[idt];   s2 = x[1+idt];
2757     s3 = x[2+idt]; s4 = x[3+idt];
2758     s5 = x[4+idt]; s6 = x[5+idt];
2759     s7 = x[6+idt];
2760     while (nz--) {
2761       idx   = 7*(*vi++);
2762       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2763       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2764       x7    = x[6+idx];
2765       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2766       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2767       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2768       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2769       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2770       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2771       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2772       v += 49;
2773     }
2774     v        = aa + 49*diag[i];
2775     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2776                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2777     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2778                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2779     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2780                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2781     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2782                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2783     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2784                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2785     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2786                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2787     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2788                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2789   }
2790 
2791   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2793   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2794   PetscFunctionReturn(0);
2795 }
2796 
2797 #undef __FUNCT__
2798 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2799 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2800 {
2801     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2802     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2803     PetscErrorCode    ierr;
2804     PetscInt          i,k,nz,idx,jdx,idt;
2805     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2806     const MatScalar   *aa=a->a,*v;
2807     PetscScalar       *x;
2808     const PetscScalar *b;
2809     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2810 
2811     PetscFunctionBegin;
2812     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2813     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2814     /* forward solve the lower triangular */
2815     idx    = 0;
2816     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2817     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2818     for (i=1; i<n; i++) {
2819        v    = aa + bs2*ai[i];
2820        vi   = aj + ai[i];
2821        nz   = ai[i+1] - ai[i];
2822       idx   = bs*i;
2823        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2824        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2825        for(k=0;k<nz;k++) {
2826           jdx   = bs*vi[k];
2827           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2828 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2829           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2830           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2831           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2832 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2833           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2834 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2835 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2836           v   +=  bs2;
2837         }
2838 
2839        x[idx]   = s1;
2840        x[1+idx] = s2;
2841        x[2+idx] = s3;
2842        x[3+idx] = s4;
2843        x[4+idx] = s5;
2844        x[5+idx] = s6;
2845        x[6+idx] = s7;
2846     }
2847 
2848    /* backward solve the upper triangular */
2849   for (i=n-1; i>=0; i--){
2850     v   = aa + bs2*(adiag[i+1]+1);
2851      vi  = aj + adiag[i+1]+1;
2852      nz  = adiag[i] - adiag[i+1]-1;
2853      idt = bs*i;
2854      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2855      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2856     for(k=0;k<nz;k++) {
2857       idx   = bs*vi[k];
2858        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2859        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2860        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2861        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2862        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2863        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2864        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2865        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2866        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2867         v   +=  bs2;
2868     }
2869     /* x = inv_diagonal*x */
2870     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2871     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2872     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2873     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2874     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2875     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2876     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2877   }
2878 
2879   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2880   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2881   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2882   PetscFunctionReturn(0);
2883 }
2884 
2885 #undef __FUNCT__
2886 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2887 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2888 {
2889   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2890   IS                iscol=a->col,isrow=a->row;
2891   PetscErrorCode    ierr;
2892   const PetscInt    *r,*c,*rout,*cout;
2893   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2894   PetscInt          i,nz,idx,idt,idc;
2895   const MatScalar   *aa=a->a,*v;
2896   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2897   const PetscScalar *b;
2898 
2899   PetscFunctionBegin;
2900   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2901   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2902   t  = a->solve_work;
2903 
2904   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2905   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2906 
2907   /* forward solve the lower triangular */
2908   idx    = 6*(*r++);
2909   t[0] = b[idx];   t[1] = b[1+idx];
2910   t[2] = b[2+idx]; t[3] = b[3+idx];
2911   t[4] = b[4+idx]; t[5] = b[5+idx];
2912   for (i=1; i<n; i++) {
2913     v     = aa + 36*ai[i];
2914     vi    = aj + ai[i];
2915     nz    = diag[i] - ai[i];
2916     idx   = 6*(*r++);
2917     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2918     s5  = b[4+idx]; s6 = b[5+idx];
2919     while (nz--) {
2920       idx   = 6*(*vi++);
2921       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2922       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2923       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2924       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2925       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2926       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2927       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2928       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2929       v += 36;
2930     }
2931     idx = 6*i;
2932     t[idx]   = s1;t[1+idx] = s2;
2933     t[2+idx] = s3;t[3+idx] = s4;
2934     t[4+idx] = s5;t[5+idx] = s6;
2935   }
2936   /* backward solve the upper triangular */
2937   for (i=n-1; i>=0; i--){
2938     v    = aa + 36*diag[i] + 36;
2939     vi   = aj + diag[i] + 1;
2940     nz   = ai[i+1] - diag[i] - 1;
2941     idt  = 6*i;
2942     s1 = t[idt];  s2 = t[1+idt];
2943     s3 = t[2+idt];s4 = t[3+idt];
2944     s5 = t[4+idt];s6 = t[5+idt];
2945     while (nz--) {
2946       idx   = 6*(*vi++);
2947       x1    = t[idx];   x2 = t[1+idx];
2948       x3    = t[2+idx]; x4 = t[3+idx];
2949       x5    = t[4+idx]; x6 = t[5+idx];
2950       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2951       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2952       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2953       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2954       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2955       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2956       v += 36;
2957     }
2958     idc = 6*(*c--);
2959     v   = aa + 36*diag[i];
2960     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2961                                  v[18]*s4+v[24]*s5+v[30]*s6;
2962     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2963                                  v[19]*s4+v[25]*s5+v[31]*s6;
2964     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2965                                  v[20]*s4+v[26]*s5+v[32]*s6;
2966     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2967                                  v[21]*s4+v[27]*s5+v[33]*s6;
2968     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2969                                  v[22]*s4+v[28]*s5+v[34]*s6;
2970     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2971                                  v[23]*s4+v[29]*s5+v[35]*s6;
2972   }
2973 
2974   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2975   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2976   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2977   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2978   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2979   PetscFunctionReturn(0);
2980 }
2981 
2982 #undef __FUNCT__
2983 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
2984 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2985 {
2986   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2987   IS                iscol=a->col,isrow=a->row;
2988   PetscErrorCode    ierr;
2989   const PetscInt    *r,*c,*rout,*cout;
2990   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2991   PetscInt          i,nz,idx,idt,idc,m;
2992   const MatScalar   *aa=a->a,*v;
2993   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2994   const PetscScalar *b;
2995 
2996   PetscFunctionBegin;
2997   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2998   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2999   t  = a->solve_work;
3000 
3001   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3002   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3003 
3004   /* forward solve the lower triangular */
3005   idx    = 6*r[0];
3006   t[0] = b[idx];   t[1] = b[1+idx];
3007   t[2] = b[2+idx]; t[3] = b[3+idx];
3008   t[4] = b[4+idx]; t[5] = b[5+idx];
3009   for (i=1; i<n; i++) {
3010     v     = aa + 36*ai[i];
3011     vi    = aj + ai[i];
3012     nz    = ai[i+1] - ai[i];
3013     idx   = 6*r[i];
3014     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3015     s5  = b[4+idx]; s6 = b[5+idx];
3016     for(m=0;m<nz;m++){
3017       idx   = 6*vi[m];
3018       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3019       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3020       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3021       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3022       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3023       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3024       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3025       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3026       v += 36;
3027     }
3028     idx = 6*i;
3029     t[idx]   = s1;t[1+idx] = s2;
3030     t[2+idx] = s3;t[3+idx] = s4;
3031     t[4+idx] = s5;t[5+idx] = s6;
3032   }
3033   /* backward solve the upper triangular */
3034   for (i=n-1; i>=0; i--){
3035     v    = aa + 36*(adiag[i+1]+1);
3036     vi   = aj + adiag[i+1]+1;
3037     nz   = adiag[i] - adiag[i+1] - 1;
3038     idt  = 6*i;
3039     s1 = t[idt];  s2 = t[1+idt];
3040     s3 = t[2+idt];s4 = t[3+idt];
3041     s5 = t[4+idt];s6 = t[5+idt];
3042     for(m=0;m<nz;m++){
3043       idx   = 6*vi[m];
3044       x1    = t[idx];   x2 = t[1+idx];
3045       x3    = t[2+idx]; x4 = t[3+idx];
3046       x5    = t[4+idx]; x6 = t[5+idx];
3047       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3048       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3049       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3050       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3051       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3052       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3053       v += 36;
3054     }
3055     idc = 6*c[i];
3056     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3057                                  v[18]*s4+v[24]*s5+v[30]*s6;
3058     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3059                                  v[19]*s4+v[25]*s5+v[31]*s6;
3060     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3061                                  v[20]*s4+v[26]*s5+v[32]*s6;
3062     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3063                                  v[21]*s4+v[27]*s5+v[33]*s6;
3064     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3065                                  v[22]*s4+v[28]*s5+v[34]*s6;
3066     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3067                                  v[23]*s4+v[29]*s5+v[35]*s6;
3068   }
3069 
3070   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3071   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3072   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3073   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3074   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3075   PetscFunctionReturn(0);
3076 }
3077 
3078 #undef __FUNCT__
3079 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
3080 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3081 {
3082   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3083   PetscInt          i,nz,idx,idt,jdx;
3084   PetscErrorCode    ierr;
3085   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3086   const MatScalar   *aa=a->a,*v;
3087   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3088   const PetscScalar *b;
3089 
3090   PetscFunctionBegin;
3091   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3092   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3093   /* forward solve the lower triangular */
3094   idx    = 0;
3095   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3096   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3097   for (i=1; i<n; i++) {
3098     v     =  aa + 36*ai[i];
3099     vi    =  aj + ai[i];
3100     nz    =  diag[i] - ai[i];
3101     idx   =  6*i;
3102     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3103     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3104     while (nz--) {
3105       jdx   = 6*(*vi++);
3106       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3107       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3108       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3109       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3110       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3111       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3112       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3113       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3114       v += 36;
3115      }
3116     x[idx]   = s1;
3117     x[1+idx] = s2;
3118     x[2+idx] = s3;
3119     x[3+idx] = s4;
3120     x[4+idx] = s5;
3121     x[5+idx] = s6;
3122   }
3123   /* backward solve the upper triangular */
3124   for (i=n-1; i>=0; i--){
3125     v    = aa + 36*diag[i] + 36;
3126     vi   = aj + diag[i] + 1;
3127     nz   = ai[i+1] - diag[i] - 1;
3128     idt  = 6*i;
3129     s1 = x[idt];   s2 = x[1+idt];
3130     s3 = x[2+idt]; s4 = x[3+idt];
3131     s5 = x[4+idt]; s6 = x[5+idt];
3132     while (nz--) {
3133       idx   = 6*(*vi++);
3134       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3135       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3136       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3137       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3138       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3139       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3140       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3141       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3142       v += 36;
3143     }
3144     v        = aa + 36*diag[i];
3145     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3146     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3147     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3148     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3149     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3150     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3151   }
3152 
3153   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3154   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3155   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3156   PetscFunctionReturn(0);
3157 }
3158 
3159 #undef __FUNCT__
3160 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3161 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3162 {
3163     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3164     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3165     PetscErrorCode    ierr;
3166     PetscInt          i,k,nz,idx,jdx,idt;
3167     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3168     const MatScalar   *aa=a->a,*v;
3169     PetscScalar       *x;
3170     const PetscScalar *b;
3171     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3172 
3173     PetscFunctionBegin;
3174     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3175     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3176     /* forward solve the lower triangular */
3177     idx    = 0;
3178     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3179     x[4] = b[4+idx];x[5] = b[5+idx];
3180     for (i=1; i<n; i++) {
3181        v    = aa + bs2*ai[i];
3182        vi   = aj + ai[i];
3183        nz   = ai[i+1] - ai[i];
3184       idx   = bs*i;
3185        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3186        s5   = b[4+idx];s6 = b[5+idx];
3187        for(k=0;k<nz;k++){
3188           jdx   = bs*vi[k];
3189           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3190 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3191           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3192           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3193           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3194 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3195           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3196 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3197           v   +=  bs2;
3198         }
3199 
3200        x[idx]   = s1;
3201        x[1+idx] = s2;
3202        x[2+idx] = s3;
3203        x[3+idx] = s4;
3204        x[4+idx] = s5;
3205        x[5+idx] = s6;
3206     }
3207 
3208    /* backward solve the upper triangular */
3209   for (i=n-1; i>=0; i--){
3210     v   = aa + bs2*(adiag[i+1]+1);
3211      vi  = aj + adiag[i+1]+1;
3212      nz  = adiag[i] - adiag[i+1]-1;
3213      idt = bs*i;
3214      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3215      s5 = x[4+idt];s6 = x[5+idt];
3216      for(k=0;k<nz;k++){
3217       idx   = bs*vi[k];
3218        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3219        x5    = x[4+idx];x6 = x[5+idx];
3220        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3221        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3222        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3223        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3224        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3225        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3226         v   +=  bs2;
3227     }
3228     /* x = inv_diagonal*x */
3229    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3230    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3231    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3232    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3233    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3234    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3235   }
3236 
3237   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3238   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3239   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3240   PetscFunctionReturn(0);
3241 }
3242 
3243 #undef __FUNCT__
3244 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3245 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3246 {
3247   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3248   IS                iscol=a->col,isrow=a->row;
3249   PetscErrorCode    ierr;
3250   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3251   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3252   PetscInt          i,nz,idx,idt,idc;
3253   const MatScalar   *aa=a->a,*v;
3254   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3255   const PetscScalar *b;
3256 
3257   PetscFunctionBegin;
3258   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3259   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3260   t  = a->solve_work;
3261 
3262   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3263   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3264 
3265   /* forward solve the lower triangular */
3266   idx    = 5*(*r++);
3267   t[0] = b[idx];   t[1] = b[1+idx];
3268   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3269   for (i=1; i<n; i++) {
3270     v     = aa + 25*ai[i];
3271     vi    = aj + ai[i];
3272     nz    = diag[i] - ai[i];
3273     idx   = 5*(*r++);
3274     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3275     s5  = b[4+idx];
3276     while (nz--) {
3277       idx   = 5*(*vi++);
3278       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3279       x4    = t[3+idx];x5 = t[4+idx];
3280       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3281       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3282       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3283       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3284       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3285       v += 25;
3286     }
3287     idx = 5*i;
3288     t[idx]   = s1;t[1+idx] = s2;
3289     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3290   }
3291   /* backward solve the upper triangular */
3292   for (i=n-1; i>=0; i--){
3293     v    = aa + 25*diag[i] + 25;
3294     vi   = aj + diag[i] + 1;
3295     nz   = ai[i+1] - diag[i] - 1;
3296     idt  = 5*i;
3297     s1 = t[idt];  s2 = t[1+idt];
3298     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3299     while (nz--) {
3300       idx   = 5*(*vi++);
3301       x1    = t[idx];   x2 = t[1+idx];
3302       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3303       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3304       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3305       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3306       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3307       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3308       v += 25;
3309     }
3310     idc = 5*(*c--);
3311     v   = aa + 25*diag[i];
3312     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3313                                  v[15]*s4+v[20]*s5;
3314     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3315                                  v[16]*s4+v[21]*s5;
3316     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3317                                  v[17]*s4+v[22]*s5;
3318     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3319                                  v[18]*s4+v[23]*s5;
3320     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3321                                  v[19]*s4+v[24]*s5;
3322   }
3323 
3324   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3325   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3326   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3327   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3328   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3329   PetscFunctionReturn(0);
3330 }
3331 
3332 #undef __FUNCT__
3333 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3334 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3335 {
3336   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3337   IS                iscol=a->col,isrow=a->row;
3338   PetscErrorCode    ierr;
3339   const PetscInt    *r,*c,*rout,*cout;
3340   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3341   PetscInt          i,nz,idx,idt,idc,m;
3342   const MatScalar   *aa=a->a,*v;
3343   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3344   const PetscScalar *b;
3345 
3346   PetscFunctionBegin;
3347   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3348   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3349   t  = a->solve_work;
3350 
3351   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3352   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3353 
3354   /* forward solve the lower triangular */
3355   idx    = 5*r[0];
3356   t[0] = b[idx];   t[1] = b[1+idx];
3357   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3358   for (i=1; i<n; i++) {
3359     v     = aa + 25*ai[i];
3360     vi    = aj + ai[i];
3361     nz    = ai[i+1] - ai[i];
3362     idx   = 5*r[i];
3363     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3364     s5  = b[4+idx];
3365     for(m=0;m<nz;m++){
3366       idx   = 5*vi[m];
3367       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3368       x4    = t[3+idx];x5 = t[4+idx];
3369       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3370       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3371       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3372       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3373       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3374       v += 25;
3375     }
3376     idx = 5*i;
3377     t[idx]   = s1;t[1+idx] = s2;
3378     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3379   }
3380   /* backward solve the upper triangular */
3381   for (i=n-1; i>=0; i--){
3382     v    = aa + 25*(adiag[i+1]+1);
3383     vi   = aj + adiag[i+1]+1;
3384     nz   = adiag[i] - adiag[i+1] - 1;
3385     idt  = 5*i;
3386     s1 = t[idt];  s2 = t[1+idt];
3387     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3388     for(m=0;m<nz;m++){
3389       idx   = 5*vi[m];
3390       x1    = t[idx];   x2 = t[1+idx];
3391       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3392       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3393       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3394       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3395       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3396       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3397       v += 25;
3398     }
3399     idc = 5*c[i];
3400     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3401                                  v[15]*s4+v[20]*s5;
3402     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3403                                  v[16]*s4+v[21]*s5;
3404     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3405                                  v[17]*s4+v[22]*s5;
3406     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3407                                  v[18]*s4+v[23]*s5;
3408     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3409                                  v[19]*s4+v[24]*s5;
3410   }
3411 
3412   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3413   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3414   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3415   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3416   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3417   PetscFunctionReturn(0);
3418 }
3419 
3420 #undef __FUNCT__
3421 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3422 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3423 {
3424   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3425   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3426   PetscInt          i,nz,idx,idt,jdx;
3427   PetscErrorCode    ierr;
3428   const MatScalar   *aa=a->a,*v;
3429   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3430   const PetscScalar *b;
3431 
3432   PetscFunctionBegin;
3433   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3434   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3435   /* forward solve the lower triangular */
3436   idx    = 0;
3437   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3438   for (i=1; i<n; i++) {
3439     v     =  aa + 25*ai[i];
3440     vi    =  aj + ai[i];
3441     nz    =  diag[i] - ai[i];
3442     idx   =  5*i;
3443     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3444     while (nz--) {
3445       jdx   = 5*(*vi++);
3446       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3447       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3448       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3449       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3450       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3451       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3452       v    += 25;
3453     }
3454     x[idx]   = s1;
3455     x[1+idx] = s2;
3456     x[2+idx] = s3;
3457     x[3+idx] = s4;
3458     x[4+idx] = s5;
3459   }
3460   /* backward solve the upper triangular */
3461   for (i=n-1; i>=0; i--){
3462     v    = aa + 25*diag[i] + 25;
3463     vi   = aj + diag[i] + 1;
3464     nz   = ai[i+1] - diag[i] - 1;
3465     idt  = 5*i;
3466     s1 = x[idt];  s2 = x[1+idt];
3467     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3468     while (nz--) {
3469       idx   = 5*(*vi++);
3470       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3471       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3472       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3473       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3474       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3475       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3476       v    += 25;
3477     }
3478     v        = aa + 25*diag[i];
3479     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3480     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3481     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3482     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3483     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3484   }
3485 
3486   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3487   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3488   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3489   PetscFunctionReturn(0);
3490 }
3491 
3492 #undef __FUNCT__
3493 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3494 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3495 {
3496   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3497   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3498   PetscInt          i,k,nz,idx,idt,jdx;
3499   PetscErrorCode    ierr;
3500   const MatScalar   *aa=a->a,*v;
3501   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3502   const PetscScalar *b;
3503 
3504   PetscFunctionBegin;
3505   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3506   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3507   /* forward solve the lower triangular */
3508   idx    = 0;
3509   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3510   for (i=1; i<n; i++) {
3511     v   = aa + 25*ai[i];
3512     vi  = aj + ai[i];
3513     nz  = ai[i+1] - ai[i];
3514     idx = 5*i;
3515     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3516     for(k=0;k<nz;k++) {
3517       jdx   = 5*vi[k];
3518       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3519       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3520       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3521       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3522       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3523       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3524       v    += 25;
3525     }
3526     x[idx]   = s1;
3527     x[1+idx] = s2;
3528     x[2+idx] = s3;
3529     x[3+idx] = s4;
3530     x[4+idx] = s5;
3531   }
3532 
3533   /* backward solve the upper triangular */
3534   for (i=n-1; i>=0; i--){
3535     v   = aa + 25*(adiag[i+1]+1);
3536     vi  = aj + adiag[i+1]+1;
3537     nz  = adiag[i] - adiag[i+1]-1;
3538     idt = 5*i;
3539     s1 = x[idt];  s2 = x[1+idt];
3540     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3541     for(k=0;k<nz;k++){
3542       idx   = 5*vi[k];
3543       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3544       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3545       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3546       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3547       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3548       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3549       v    += 25;
3550     }
3551     /* x = inv_diagonal*x */
3552     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3553     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3554     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3555     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3556     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3557   }
3558 
3559   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3560   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3561   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3562   PetscFunctionReturn(0);
3563 }
3564 
3565 #undef __FUNCT__
3566 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3567 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3568 {
3569   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3570   IS                iscol=a->col,isrow=a->row;
3571   PetscErrorCode    ierr;
3572   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3573   PetscInt          i,nz,idx,idt,idc;
3574   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3575   const MatScalar   *aa=a->a,*v;
3576   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3577   const PetscScalar *b;
3578 
3579   PetscFunctionBegin;
3580   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3581   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3582   t  = a->solve_work;
3583 
3584   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3585   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3586 
3587   /* forward solve the lower triangular */
3588   idx    = 4*(*r++);
3589   t[0] = b[idx];   t[1] = b[1+idx];
3590   t[2] = b[2+idx]; t[3] = b[3+idx];
3591   for (i=1; i<n; i++) {
3592     v     = aa + 16*ai[i];
3593     vi    = aj + ai[i];
3594     nz    = diag[i] - ai[i];
3595     idx   = 4*(*r++);
3596     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3597     while (nz--) {
3598       idx   = 4*(*vi++);
3599       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3600       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3601       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3602       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3603       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3604       v    += 16;
3605     }
3606     idx        = 4*i;
3607     t[idx]   = s1;t[1+idx] = s2;
3608     t[2+idx] = s3;t[3+idx] = s4;
3609   }
3610   /* backward solve the upper triangular */
3611   for (i=n-1; i>=0; i--){
3612     v    = aa + 16*diag[i] + 16;
3613     vi   = aj + diag[i] + 1;
3614     nz   = ai[i+1] - diag[i] - 1;
3615     idt  = 4*i;
3616     s1 = t[idt];  s2 = t[1+idt];
3617     s3 = t[2+idt];s4 = t[3+idt];
3618     while (nz--) {
3619       idx   = 4*(*vi++);
3620       x1    = t[idx];   x2 = t[1+idx];
3621       x3    = t[2+idx]; x4 = t[3+idx];
3622       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3623       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3624       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3625       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3626       v += 16;
3627     }
3628     idc      = 4*(*c--);
3629     v        = aa + 16*diag[i];
3630     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3631     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3632     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3633     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3634   }
3635 
3636   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3637   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3638   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3639   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3640   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3641   PetscFunctionReturn(0);
3642 }
3643 
3644 #undef __FUNCT__
3645 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
/*
   MatSolve_SeqBAIJ_4 - Triangular solve with a factored SeqBAIJ matrix whose
   blocks are 4x4, applying the row and column permutations kept in the factor.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right hand side

   Output Parameter:
.  xx - the solution

   Notes:
   The intermediate vector is kept in a->solve_work.  Row i of the lower factor
   occupies entries ai[i] .. ai[i+1]-1; the off-diagonal blocks of row i of the
   upper factor occupy entries adiag[i+1]+1 .. adiag[i]-1 and are followed by the
   diagonal block, which is applied by multiplication.
*/
3646 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3647 {
3648   Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
3649   IS                rowis = fact->row,colis = fact->col;
3650   const PetscInt    nblk = fact->mbs,*bi = fact->i,*bj = fact->j,*bdiag = fact->diag;
3651   const PetscInt    *rperm,*cperm,*cols;
3652   const MatScalar   *ba = fact->a,*blk;
3653   const PetscScalar *rhs;
3654   PetscScalar       *sol,*work,*w,*u,s1,s2,s3,s4,x1,x2,x3,x4;
3655   PetscInt          row,cnt,off;
3656   PetscErrorCode    ierr;
3657 
3658   PetscFunctionBegin;
3659   ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
3660   ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
3661   ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
3662   ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);
3663   work = fact->solve_work;
3664 
3665   /* forward sweep with the lower factor; block row 0 is just the permuted rhs */
3666   off     = 4*rperm[0];
3667   work[0] = rhs[off];   work[1] = rhs[off+1];
3668   work[2] = rhs[off+2]; work[3] = rhs[off+3];
3669   for (row=1; row<nblk; row++) {
3670     off  = 4*rperm[row];
3671     s1   = rhs[off]; s2 = rhs[off+1]; s3 = rhs[off+2]; s4 = rhs[off+3];
3672     blk  = ba + 16*bi[row];
3673     cols = bj + bi[row];
3674     for (cnt=bi[row+1]-bi[row]; cnt>0; cnt--) {
3675       u  = work + 4*(*cols++);
3676       x1 = u[0]; x2 = u[1]; x3 = u[2]; x4 = u[3];
3677       s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3  + blk[12]*x4;
3678       s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3  + blk[13]*x4;
3679       s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
3680       s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
3681       blk += 16;
3682     }
3683     w    = work + 4*row;
3684     w[0] = s1; w[1] = s2; w[2] = s3; w[3] = s4;
3685   }
3686 
3687   /* backward sweep with the upper factor */
3688   for (row=nblk-1; row>=0; row--) {
3689     w    = work + 4*row;
3690     s1   = w[0]; s2 = w[1]; s3 = w[2]; s4 = w[3];
3691     blk  = ba + 16*(bdiag[row+1]+1);
3692     cols = bj + bdiag[row+1]+1;
3693     for (cnt=bdiag[row]-bdiag[row+1]-1; cnt>0; cnt--) {
3694       u  = work + 4*(*cols++);
3695       x1 = u[0]; x2 = u[1]; x3 = u[2]; x4 = u[3];
3696       s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3  + blk[12]*x4;
3697       s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3  + blk[13]*x4;
3698       s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
3699       s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
3700       blk += 16;
3701     }
3702     /* blk has advanced onto the diagonal block of this row; multiply by it */
3703     w[0] = blk[0]*s1+blk[4]*s2+blk[8]*s3+blk[12]*s4;
3704     w[1] = blk[1]*s1+blk[5]*s2+blk[9]*s3+blk[13]*s4;
3705     w[2] = blk[2]*s1+blk[6]*s2+blk[10]*s3+blk[14]*s4;
3706     w[3] = blk[3]*s1+blk[7]*s2+blk[11]*s3+blk[15]*s4;
3707     off  = 4*cperm[row];
3708     sol[off] = w[0]; sol[off+1] = w[1]; sol[off+2] = w[2]; sol[off+3] = w[3];
3709   }
3710 
3711   ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
3712   ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
3713   ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
3714   ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
3715   ierr = PetscLogFlops(2.0*16*(fact->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3716   PetscFunctionReturn(0);
3717 }
3721 
3722 #undef __FUNCT__
3723 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
/*
   MatSolve_SeqBAIJ_4_Demotion - Triangular solve with a factored SeqBAIJ matrix
   of 4x4 blocks, with row and column permutations, carrying out the arithmetic
   in MatScalar precision.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right hand side

   Output Parameter:
.  xx - the solution

   Notes:
   Right hand side entries are cast to MatScalar on load, the intermediate vector
   is kept as MatScalar in the storage of a->solve_work, and each result is cast
   back to PetscScalar when written to xx.  Within block row i the lower factor
   occupies entries ai[i] .. diag[i]-1, the diagonal block sits at diag[i] and
   the upper factor follows it up to ai[i+1]-1.
*/
3724 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3725 {
3726   Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
3727   IS                rowis = fact->row,colis = fact->col;
3728   const PetscInt    nblk = fact->mbs,*bi = fact->i,*bj = fact->j,*bdiag = fact->diag;
3729   const PetscInt    *rperm,*cperm,*cols;
3730   const MatScalar   *ba = fact->a,*blk;
3731   const PetscScalar *rhs;
3732   PetscScalar       *sol;
3733   MatScalar         *work,*w,*u,s1,s2,s3,s4,x1,x2,x3,x4;
3734   PetscInt          row,cnt,off;
3735   PetscErrorCode    ierr;
3736 
3737   PetscFunctionBegin;
3738   ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
3739   ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
3740   ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
3741   ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);
3742   work = (MatScalar *)fact->solve_work;
3743 
3744   /* forward sweep with the lower factor; block row 0 is just the permuted rhs */
3745   off     = 4*rperm[0];
3746   work[0] = (MatScalar)rhs[off];
3747   work[1] = (MatScalar)rhs[off+1];
3748   work[2] = (MatScalar)rhs[off+2];
3749   work[3] = (MatScalar)rhs[off+3];
3750   for (row=1; row<nblk; row++) {
3751     off  = 4*rperm[row];
3752     s1   = (MatScalar)rhs[off];
3753     s2   = (MatScalar)rhs[off+1];
3754     s3   = (MatScalar)rhs[off+2];
3755     s4   = (MatScalar)rhs[off+3];
3756     blk  = ba + 16*bi[row];
3757     cols = bj + bi[row];
3758     cnt  = bdiag[row] - bi[row];
3759     while (cnt--) {
3760       u  = work + 4*(*cols++);
3761       x1 = u[0]; x2 = u[1]; x3 = u[2]; x4 = u[3];
3762       s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3  + blk[12]*x4;
3763       s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3  + blk[13]*x4;
3764       s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
3765       s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
3766       blk += 16;
3767     }
3768     w    = work + 4*row;
3769     w[0] = s1; w[1] = s2; w[2] = s3; w[3] = s4;
3770   }
3771 
3772   /* backward sweep with the upper factor */
3773   for (row=nblk-1; row>=0; row--) {
3774     w    = work + 4*row;
3775     s1   = w[0]; s2 = w[1]; s3 = w[2]; s4 = w[3];
3776     blk  = ba + 16*bdiag[row] + 16;
3777     cols = bj + bdiag[row] + 1;
3778     cnt  = bi[row+1] - bdiag[row] - 1;
3779     while (cnt--) {
3780       u  = work + 4*(*cols++);
3781       x1 = u[0]; x2 = u[1]; x3 = u[2]; x4 = u[3];
3782       s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3  + blk[12]*x4;
3783       s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3  + blk[13]*x4;
3784       s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
3785       s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
3786       blk += 16;
3787     }
3788     /* multiply by the diagonal block of this row */
3789     blk  = ba + 16*bdiag[row];
3790     w[0] = blk[0]*s1+blk[4]*s2+blk[8]*s3+blk[12]*s4;
3791     w[1] = blk[1]*s1+blk[5]*s2+blk[9]*s3+blk[13]*s4;
3792     w[2] = blk[2]*s1+blk[6]*s2+blk[10]*s3+blk[14]*s4;
3793     w[3] = blk[3]*s1+blk[7]*s2+blk[11]*s3+blk[15]*s4;
3794     /* promote and scatter through the column permutation */
3795     off        = 4*cperm[row];
3796     sol[off]   = (PetscScalar)w[0];
3797     sol[off+1] = (PetscScalar)w[1];
3798     sol[off+2] = (PetscScalar)w[2];
3799     sol[off+3] = (PetscScalar)w[3];
3800   }
3801 
3802   ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
3803   ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
3804   ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
3805   ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
3806   ierr = PetscLogFlops(2.0*16*(fact->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3807   PetscFunctionReturn(0);
3808 }
3819 
3820 #if defined (PETSC_HAVE_SSE)
3821 
3822 #include PETSC_HAVE_SSE
3823 
3824 #undef __FUNCT__
3825 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - Triangular solve with a factored SeqBAIJ
   matrix of 4x4 blocks, with row and column permutations, using the SSE macro
   kernels in single precision.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right hand side

   Output Parameter:
.  xx - the solution

   Notes:
   The partial solution is kept in PetscScalar precision in the work array
   t (a->solve_work).  Every 4-vector is demoted to float in the aligned stack
   buffers tmps/tmpx before the 4x4 block product and promoted again afterwards.
   xx is written only in the backward sweep, through the column permutation.
   XMM7 carries the running 4-vector across the macro blocks of one block row.
*/
3826 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3827 {
3828   /*
3829      Note: This code uses demotion of double
3830      to float when performing the mixed-mode computation.
3831      This may not be numerically reasonable for all applications.
3832   */
3833   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3834   IS             iscol=a->col,isrow=a->row;
3835   PetscErrorCode ierr;
3836   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3837   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3838   MatScalar      *aa=a->a,*v;
3839   PetscScalar    *x,*b,*t;
3840 
3841   /* Make space in temp stack for 16 Byte Aligned arrays */
     /* 11 floats = two 4-float buffers plus up to 3 floats of alignment padding */
3842   float           ssealignedspace[11],*tmps,*tmpx;
3843   unsigned long   offset;
3844 
3845   PetscFunctionBegin;
3846   SSE_SCOPE_BEGIN;
3847 
       /* round the buffer start up to the next 16-byte boundary (offset counted in floats) */
3848     offset = (unsigned long)ssealignedspace % 16;
3849     if (offset) offset = (16 - offset)/4;
3850     tmps = &ssealignedspace[offset];
3851     tmpx = &ssealignedspace[offset+4];
3852     PREFETCH_NTA(aa+16*ai[1]);
3853 
3854     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3855     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3856     t  = a->solve_work;
3857 
3858     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3859     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3860 
3861     /* forward solve the lower triangular */
3862     idx  = 4*(*r++);
3863     t[0] = b[idx];   t[1] = b[1+idx];
3864     t[2] = b[2+idx]; t[3] = b[3+idx];
3865     v    =  aa + 16*ai[1];
3866 
       /* i is advanced inside the body (ai[++i]) so the next row can be prefetched */
3867     for (i=1; i<n;) {
3868       PREFETCH_NTA(&v[8]);
3869       vi   =  aj      + ai[i];
3870       nz   =  diag[i] - ai[i];
3871       idx  =  4*(*r++);
3872 
3873       /* Demote sum from double to float */
3874       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3875       LOAD_PS(tmps,XMM7);
3876 
3877       while (nz--) {
3878         PREFETCH_NTA(&v[16]);
3879         idx = 4*(*vi++);
3880 
3881         /* Demote solution (so far) from double to float */
           /* The partial solution of the forward sweep lives in the work array t; */
           /* x is not written until the backward sweep, so it must not be read here. */
3882         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3883 
3884         /* 4x4 Matrix-Vector product with negative accumulation: */
3885         SSE_INLINE_BEGIN_2(tmpx,v)
3886           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3887 
3888           /* First Column */
3889           SSE_COPY_PS(XMM0,XMM6)
3890           SSE_SHUFFLE(XMM0,XMM0,0x00)
3891           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3892           SSE_SUB_PS(XMM7,XMM0)
3893 
3894           /* Second Column */
3895           SSE_COPY_PS(XMM1,XMM6)
3896           SSE_SHUFFLE(XMM1,XMM1,0x55)
3897           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3898           SSE_SUB_PS(XMM7,XMM1)
3899 
3900           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3901 
3902           /* Third Column */
3903           SSE_COPY_PS(XMM2,XMM6)
3904           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3905           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3906           SSE_SUB_PS(XMM7,XMM2)
3907 
3908           /* Fourth Column */
3909           SSE_COPY_PS(XMM3,XMM6)
3910           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3911           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3912           SSE_SUB_PS(XMM7,XMM3)
3913         SSE_INLINE_END_2
3914 
3915         v  += 16;
3916       }
3917       idx = 4*i;
3918       v   = aa + 16*ai[++i];
3919       PREFETCH_NTA(v);
3920       STORE_PS(tmps,XMM7);
3921 
3922       /* Promote result from float to double */
3923       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3924     }
3925     /* backward solve the upper triangular */
3926     idt  = 4*(n-1);
3927     ai16 = 16*diag[n-1];
3928     v    = aa + ai16 + 16;
       /* i is decremented inside the body so the next diagonal block can be prefetched */
3929     for (i=n-1; i>=0;){
3930       PREFETCH_NTA(&v[8]);
3931       vi = aj + diag[i] + 1;
3932       nz = ai[i+1] - diag[i] - 1;
3933 
3934       /* Demote accumulator from double to float */
3935       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3936       LOAD_PS(tmps,XMM7);
3937 
3938       while (nz--) {
3939         PREFETCH_NTA(&v[16]);
3940         idx = 4*(*vi++);
3941 
3942         /* Demote solution (so far) from double to float */
3943         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3944 
3945         /* 4x4 Matrix-Vector Product with negative accumulation: */
3946         SSE_INLINE_BEGIN_2(tmpx,v)
3947           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3948 
3949           /* First Column */
3950           SSE_COPY_PS(XMM0,XMM6)
3951           SSE_SHUFFLE(XMM0,XMM0,0x00)
3952           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3953           SSE_SUB_PS(XMM7,XMM0)
3954 
3955           /* Second Column */
3956           SSE_COPY_PS(XMM1,XMM6)
3957           SSE_SHUFFLE(XMM1,XMM1,0x55)
3958           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3959           SSE_SUB_PS(XMM7,XMM1)
3960 
3961           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3962 
3963           /* Third Column */
3964           SSE_COPY_PS(XMM2,XMM6)
3965           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3966           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3967           SSE_SUB_PS(XMM7,XMM2)
3968 
3969           /* Fourth Column */
3970           SSE_COPY_PS(XMM3,XMM6)
3971           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3972           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3973           SSE_SUB_PS(XMM7,XMM3)
3974         SSE_INLINE_END_2
3975         v  += 16;
3976       }
3977       v    = aa + ai16;
         /* Step to the previous block row.  diag[] is dereferenced only while i is */
         /* still a valid row; after row 0 the old ai16 is kept, so the prefetches  */
         /* below touch a valid address and diag[-1] is never read.                 */
3978       if (--i >= 0) ai16 = 16*diag[i];
3979       PREFETCH_NTA(aa+ai16+16);
3980       /*
3981          Scale the result by the diagonal 4x4 block,
3982          which was inverted as part of the factorization
3983       */
3984       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3985         /* First Column */
3986         SSE_COPY_PS(XMM0,XMM7)
3987         SSE_SHUFFLE(XMM0,XMM0,0x00)
3988         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3989 
3990         /* Second Column */
3991         SSE_COPY_PS(XMM1,XMM7)
3992         SSE_SHUFFLE(XMM1,XMM1,0x55)
3993         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3994         SSE_ADD_PS(XMM0,XMM1)
3995 
3996         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3997 
3998         /* Third Column */
3999         SSE_COPY_PS(XMM2,XMM7)
4000         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4001         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4002         SSE_ADD_PS(XMM0,XMM2)
4003 
4004         /* Fourth Column */
4005         SSE_COPY_PS(XMM3,XMM7)
4006         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4007         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4008         SSE_ADD_PS(XMM0,XMM3)
4009 
4010         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4011       SSE_INLINE_END_3
4012 
4013       /* Promote solution from float to double */
4014       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4015 
4016       /* Apply reordering to t and stream into x.    */
4017       /* This way, x doesn't pollute the cache.      */
4018       /* Be careful with size: 2 doubles = 4 floats! */
4019       idc  = 4*(*c--);
4020       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4021         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
4022         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4023         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4024         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
4025         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4026         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4027       SSE_INLINE_END_2
4028       v    = aa + ai16 + 16;
4029       idt -= 4;
4030     }
4031 
4032     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4033     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4034     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4035     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4036     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4037   SSE_SCOPE_END;
4038   PetscFunctionReturn(0);
4039 }
4040 
4041 #endif
4042 
4043 
4044 /*
4045       Special case where the matrix was ILU(0) factored in the natural
4046    ordering. This eliminates the need for the column and row permutation.
4047 */
4048 #undef __FUNCT__
4049 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace - Triangular solve with a factored
   SeqBAIJ matrix of 4x4 blocks without any row or column permutation.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right hand side

   Output Parameter:
.  xx - the solution

   Notes:
   One of three Fortran kernels is called when the matching
   PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ* macro is defined; otherwise the C code
   below runs.  The C code uses the array of xx as its work vector: the forward
   sweep stores its result there and the backward sweep overwrites it in place.
*/
4050 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4051 {
4052   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4053   PetscInt          n=a->mbs;
4054   const PetscInt    *ai=a->i,*aj=a->j;
4055   PetscErrorCode    ierr;
4056   const PetscInt    *diag = a->diag;
4057   const MatScalar   *aa=a->a;
4058   PetscScalar       *x;
4059   const PetscScalar *b;
4060 
4061   PetscFunctionBegin;
4062   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4063   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4064 
4065 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4066   {
       /* NOTE(review): fixed-size static scratch handed to the Fortran kernel; the size the */
       /* kernel needs is not visible here, and a static buffer is shared by all callers.    */
4067     static PetscScalar w[2000]; /* very BAD need to fix */
4068     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4069   }
4070 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4071   {
       /* NOTE(review): same fixed-size static scratch as in the branch above */
4072     static PetscScalar w[2000]; /* very BAD need to fix */
4073     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4074   }
4075 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4076   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4077 #else
4078   {
4079     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4080     const MatScalar *v;
4081     PetscInt        jdx,idt,idx,nz,i,ai16;
4082     const PetscInt  *vi;
4083 
4084   /* forward solve the lower triangular */
     /* block row 0 has nothing to its left, so it is a plain copy of the rhs */
4085   idx    = 0;
4086   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4087   for (i=1; i<n; i++) {
       /* blocks ai[i] .. diag[i]-1 of row i lie left of the diagonal; each block is 16 entries */
4088     v     =  aa      + 16*ai[i];
4089     vi    =  aj      + ai[i];
4090     nz    =  diag[i] - ai[i];
4091     idx   +=  4;
4092     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4093     while (nz--) {
4094       jdx   = 4*(*vi++);
4095       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
         /* s -= block * x; entries 0,4,8,12 form the first block row (column-by-column storage) */
4096       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4097       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4098       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4099       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4100       v    += 16;
4101     }
4102     x[idx]   = s1;
4103     x[1+idx] = s2;
4104     x[2+idx] = s3;
4105     x[3+idx] = s4;
4106   }
4107   /* backward solve the upper triangular */
4108   idt = 4*(n-1);
4109   for (i=n-1; i>=0; i--){
       /* blocks diag[i]+1 .. ai[i+1]-1 of row i lie right of the diagonal */
4110     ai16 = 16*diag[i];
4111     v    = aa + ai16 + 16;
4112     vi   = aj + diag[i] + 1;
4113     nz   = ai[i+1] - diag[i] - 1;
4114     s1 = x[idt];  s2 = x[1+idt];
4115     s3 = x[2+idt];s4 = x[3+idt];
4116     while (nz--) {
4117       idx   = 4*(*vi++);
4118       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4119       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4120       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4121       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4122       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4123       v    += 16;
4124     }
       /* multiply by the diagonal block (presumably stored already inverted by the */
       /* factorization, as the SSE variants of this solve state)                   */
4125     v        = aa + ai16;
4126     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4127     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4128     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4129     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4130     idt -= 4;
4131   }
4132   }
4133 #endif
4134 
4135   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4136   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4137   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4138   PetscFunctionReturn(0);
4139 }
4140 
4141 #undef __FUNCT__
4142 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering - Triangular solve with a factored SeqBAIJ
   matrix of 4x4 blocks without any row or column permutation.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right hand side

   Output Parameter:
.  xx - the solution

   Notes:
   The array of xx is used as the work vector.  Row i of the lower factor
   occupies entries ai[i] .. ai[i+1]-1; the off-diagonal blocks of row i of the
   upper factor occupy entries adiag[i+1]+1 .. adiag[i]-1 and are followed by the
   inverted diagonal block.  Strides are taken from A->rmap->bs and a->bs2, while
   the block arithmetic itself is written out for four components.
*/
4143 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4144 {
4145   Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
4146   const PetscInt    nblk = fact->mbs,*bi = fact->i,*bj = fact->j,*bdiag = fact->diag;
4147   const PetscInt    bs = A->rmap->bs,bs2 = fact->bs2;
4148   const PetscInt    *cols;
4149   const MatScalar   *ba = fact->a,*blk;
4150   const PetscScalar *rhs,*rb,*xj;
4151   PetscScalar       *sol,*xi,s1,s2,s3,s4,x1,x2,x3,x4;
4152   PetscInt          row,cnt;
4153   PetscErrorCode    ierr;
4154 
4155   PetscFunctionBegin;
4156   ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
4157   ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
4158 
4159   /* forward sweep with the lower factor; block row 0 is a plain copy of the rhs */
4160   sol[0] = rhs[0]; sol[1] = rhs[1]; sol[2] = rhs[2]; sol[3] = rhs[3];
4161   for (row=1; row<nblk; row++) {
4162     rb   = rhs + bs*row;
4163     s1   = rb[0]; s2 = rb[1]; s3 = rb[2]; s4 = rb[3];
4164     blk  = ba + bs2*bi[row];
4165     cols = bj + bi[row];
4166     for (cnt=bi[row+1]-bi[row]; cnt>0; cnt--) {
4167       xj = sol + bs*(*cols++);
4168       x1 = xj[0]; x2 = xj[1]; x3 = xj[2]; x4 = xj[3];
4169       s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3 + blk[12]*x4;
4170       s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3 + blk[13]*x4;
4171       s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
4172       s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
4173       blk += bs2;
4174     }
4175     xi    = sol + bs*row;
4176     xi[0] = s1; xi[1] = s2; xi[2] = s3; xi[3] = s4;
4177   }
4178 
4179   /* backward sweep with the upper factor */
4180   for (row=nblk-1; row>=0; row--) {
4181     xi   = sol + bs*row;
4182     s1   = xi[0]; s2 = xi[1]; s3 = xi[2]; s4 = xi[3];
4183     blk  = ba + bs2*(bdiag[row+1]+1);
4184     cols = bj + bdiag[row+1]+1;
4185     for (cnt=bdiag[row]-bdiag[row+1]-1; cnt>0; cnt--) {
4186       xj = sol + bs*(*cols++);
4187       x1 = xj[0]; x2 = xj[1]; x3 = xj[2]; x4 = xj[3];
4188       s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3 + blk[12]*x4;
4189       s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3 + blk[13]*x4;
4190       s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
4191       s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
4192       blk += bs2;
4193     }
4194     /* blk has advanced onto the inverted diagonal block: x = inv_diagonal*x */
4195     xi[0] = blk[0]*s1 + blk[4]*s2 + blk[8]*s3 + blk[12]*s4;
4196     xi[1] = blk[1]*s1 + blk[5]*s2 + blk[9]*s3 + blk[13]*s4;
4197     xi[2] = blk[2]*s1 + blk[6]*s2 + blk[10]*s3 + blk[14]*s4;
4198     xi[3] = blk[3]*s1 + blk[7]*s2 + blk[11]*s3 + blk[15]*s4;
4199   }
4200 
4201   ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
4202   ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
4203   ierr = PetscLogFlops(2.0*bs2*(fact->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4204   PetscFunctionReturn(0);
4205 }
4215 
4216 #undef __FUNCT__
4217 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Triangular solve with a factored
   SeqBAIJ matrix of 4x4 blocks, no permutations, arithmetic in MatScalar.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right hand side

   Output Parameter:
.  xx - the solution

   Notes:
   The forward sweep stores its MatScalar result directly in the storage of the
   xx array (viewed through a MatScalar pointer).  The backward sweep reads that
   result, reads the rows already finished as PetscScalar, and writes each
   finished row back as PetscScalar.  The order of loads and stores on this
   shared storage is therefore significant.
*/
4218 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4219 {
4220   Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
4221   const PetscInt    nblk = fact->mbs,*bi = fact->i,*bj = fact->j,*bdiag = fact->diag;
4222   const PetscInt    *cols;
4223   const MatScalar   *ba = fact->a,*blk;
4224   const PetscScalar *rhs,*rb,*xj;
4225   PetscScalar       *sol,*xi;
4226   MatScalar         *low,*ti,*tj,s1,s2,s3,s4,x1,x2,x3,x4;
4227   PetscInt          row,cnt;
4228   PetscErrorCode    ierr;
4229 
4230   PetscFunctionBegin;
4231   ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
4232   ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
4233   low  = (MatScalar *)sol;
4234 
4235   /* forward sweep with the lower factor; block row 0 is the demoted rhs */
4236   low[0] = (MatScalar)rhs[0];
4237   low[1] = (MatScalar)rhs[1];
4238   low[2] = (MatScalar)rhs[2];
4239   low[3] = (MatScalar)rhs[3];
4240   for (row=1; row<nblk; row++) {
4241     rb   = rhs + 4*row;
4242     s1   = (MatScalar)rb[0];
4243     s2   = (MatScalar)rb[1];
4244     s3   = (MatScalar)rb[2];
4245     s4   = (MatScalar)rb[3];
4246     blk  = ba + 16*bi[row];
4247     cols = bj + bi[row];
4248     cnt  = bdiag[row] - bi[row];
4249     while (cnt--) {
4250       tj = low + 4*(*cols++);
4251       x1 = tj[0]; x2 = tj[1]; x3 = tj[2]; x4 = tj[3];
4252       s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3  + blk[12]*x4;
4253       s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3  + blk[13]*x4;
4254       s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
4255       s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
4256       blk += 16;
4257     }
4258     ti    = low + 4*row;
4259     ti[0] = s1; ti[1] = s2; ti[2] = s3; ti[3] = s4;
4260   }
4261 
4262   /* backward sweep with the upper factor */
4263   for (row=nblk-1; row>=0; row--) {
4264     ti   = low + 4*row;
4265     s1   = ti[0]; s2 = ti[1]; s3 = ti[2]; s4 = ti[3];
4266     blk  = ba + 16*bdiag[row] + 16;
4267     cols = bj + bdiag[row] + 1;
4268     cnt  = bi[row+1] - bdiag[row] - 1;
4269     while (cnt--) {
4270       /* rows right of the diagonal are already final, so they are read as PetscScalar */
4271       xj = sol + 4*(*cols++);
4272       x1 = (MatScalar)xj[0];
4273       x2 = (MatScalar)xj[1];
4274       x3 = (MatScalar)xj[2];
4275       x4 = (MatScalar)xj[3];
4276       s1 -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3  + blk[12]*x4;
4277       s2 -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3  + blk[13]*x4;
4278       s3 -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
4279       s4 -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
4280       blk += 16;
4281     }
4282     /* multiply by the diagonal block and promote the finished row */
4283     blk   = ba + 16*bdiag[row];
4284     xi    = sol + 4*row;
4285     xi[0] = (PetscScalar)(blk[0]*s1 + blk[4]*s2 + blk[8]*s3  + blk[12]*s4);
4286     xi[1] = (PetscScalar)(blk[1]*s1 + blk[5]*s2 + blk[9]*s3  + blk[13]*s4);
4287     xi[2] = (PetscScalar)(blk[2]*s1 + blk[6]*s2 + blk[10]*s3 + blk[14]*s4);
4288     xi[3] = (PetscScalar)(blk[3]*s1 + blk[7]*s2 + blk[11]*s3 + blk[15]*s4);
4289   }
4290 
4291   ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
4292   ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
4293   ierr = PetscLogFlops(2.0*16*(fact->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4294   PetscFunctionReturn(0);
4295 }
4307 
4308 #if defined (PETSC_HAVE_SSE)
4309 
4310 #include PETSC_HAVE_SSE
4311 #undef __FUNCT__
4312 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - Triangular solve with a
   factored SeqBAIJ matrix of 4x4 blocks, no permutations, using the SSE macro
   kernels in single precision.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right hand side

   Output Parameter:
.  xx - the solution

   Notes:
   The column index array a->j is read through an unsigned short pointer.
   NOTE(review): this presumably requires the factorization to have stored the
   indices in that format - confirm against the caller that installs this solve.
   The solution is computed in single precision inside the storage of the xx
   array and promoted to PetscScalar in place at the end.  XMM7 carries the
   running 4-vector across the macro blocks of one block row.
*/
4313 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4314 {
4315   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4316   unsigned short *aj=(unsigned short *)a->j;
4317   PetscErrorCode ierr;
4318   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4319   MatScalar      *aa=a->a;
4320   PetscScalar    *x,*b;
4321 
4322   PetscFunctionBegin;
4323   SSE_SCOPE_BEGIN;
4324   /*
4325      Note: This code currently uses demotion of double
4326      to float when performing the mixed-mode computation.
4327      This may not be numerically reasonable for all applications.
4328   */
4329   PREFETCH_NTA(aa+16*ai[1]);
4330 
4331   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4332   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4333   {
4334     /* x will first be computed in single precision then promoted inplace to double */
4335     MatScalar      *v,*t=(MatScalar *)x;
4336     int            nz,i,idt,ai16;
4337     unsigned int   jdx,idx;
4338     unsigned short *vi;
4339     /* Forward solve the lower triangular factor. */
4340 
4341     /* First block is the identity. */
4342     idx  = 0;
4343     CONVERT_DOUBLE4_FLOAT4(t,b);
4344     v    =  aa + 16*((unsigned int)ai[1]);
4345 
       /* i is advanced inside the body (ai[++i]) so the next row can be prefetched */
4346     for (i=1; i<n;) {
4347       PREFETCH_NTA(&v[8]);
4348       vi   =  aj      + ai[i];
4349       nz   =  diag[i] - ai[i];
4350       idx +=  4;
4351 
4352       /* Demote RHS from double to float. */
4353       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4354       LOAD_PS(&t[idx],XMM7);
4355 
4356       while (nz--) {
4357         PREFETCH_NTA(&v[16]);
4358         jdx = 4*((unsigned int)(*vi++));
4359 
4360         /* 4x4 Matrix-Vector product with negative accumulation: */
4361         SSE_INLINE_BEGIN_2(&t[jdx],v)
4362           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4363 
4364           /* First Column */
4365           SSE_COPY_PS(XMM0,XMM6)
4366           SSE_SHUFFLE(XMM0,XMM0,0x00)
4367           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4368           SSE_SUB_PS(XMM7,XMM0)
4369 
4370           /* Second Column */
4371           SSE_COPY_PS(XMM1,XMM6)
4372           SSE_SHUFFLE(XMM1,XMM1,0x55)
4373           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4374           SSE_SUB_PS(XMM7,XMM1)
4375 
4376           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4377 
4378           /* Third Column */
4379           SSE_COPY_PS(XMM2,XMM6)
4380           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4381           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4382           SSE_SUB_PS(XMM7,XMM2)
4383 
4384           /* Fourth Column */
4385           SSE_COPY_PS(XMM3,XMM6)
4386           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4387           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4388           SSE_SUB_PS(XMM7,XMM3)
4389         SSE_INLINE_END_2
4390 
4391         v  += 16;
4392       }
4393       v    =  aa + 16*ai[++i];
4394       PREFETCH_NTA(v);
4395       STORE_PS(&t[idx],XMM7);
4396     }
4397 
4398     /* Backward solve the upper triangular factor.*/
4399 
4400     idt  = 4*(n-1);
4401     ai16 = 16*diag[n-1];
4402     v    = aa + ai16 + 16;
       /* i is decremented inside the body so the next diagonal block can be prefetched */
4403     for (i=n-1; i>=0;){
4404       PREFETCH_NTA(&v[8]);
4405       vi = aj + diag[i] + 1;
4406       nz = ai[i+1] - diag[i] - 1;
4407 
4408       LOAD_PS(&t[idt],XMM7);
4409 
4410       while (nz--) {
4411         PREFETCH_NTA(&v[16]);
4412         idx = 4*((unsigned int)(*vi++));
4413 
4414         /* 4x4 Matrix-Vector Product with negative accumulation: */
4415         SSE_INLINE_BEGIN_2(&t[idx],v)
4416           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4417 
4418           /* First Column */
4419           SSE_COPY_PS(XMM0,XMM6)
4420           SSE_SHUFFLE(XMM0,XMM0,0x00)
4421           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4422           SSE_SUB_PS(XMM7,XMM0)
4423 
4424           /* Second Column */
4425           SSE_COPY_PS(XMM1,XMM6)
4426           SSE_SHUFFLE(XMM1,XMM1,0x55)
4427           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4428           SSE_SUB_PS(XMM7,XMM1)
4429 
4430           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4431 
4432           /* Third Column */
4433           SSE_COPY_PS(XMM2,XMM6)
4434           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4435           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4436           SSE_SUB_PS(XMM7,XMM2)
4437 
4438           /* Fourth Column */
4439           SSE_COPY_PS(XMM3,XMM6)
4440           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4441           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4442           SSE_SUB_PS(XMM7,XMM3)
4443         SSE_INLINE_END_2
4444         v  += 16;
4445       }
4446       v    = aa + ai16;
         /* Step to the previous block row.  diag[] is dereferenced only while i is */
         /* still a valid row; after row 0 the old ai16 is kept, so the prefetches  */
         /* below touch a valid address and diag[-1] is never read.                 */
4447       if (--i >= 0) ai16 = 16*diag[i];
4448       PREFETCH_NTA(aa+ai16+16);
4449       /*
4450          Scale the result by the diagonal 4x4 block,
4451          which was inverted as part of the factorization
4452       */
4453       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4454         /* First Column */
4455         SSE_COPY_PS(XMM0,XMM7)
4456         SSE_SHUFFLE(XMM0,XMM0,0x00)
4457         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4458 
4459         /* Second Column */
4460         SSE_COPY_PS(XMM1,XMM7)
4461         SSE_SHUFFLE(XMM1,XMM1,0x55)
4462         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4463         SSE_ADD_PS(XMM0,XMM1)
4464 
4465         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4466 
4467         /* Third Column */
4468         SSE_COPY_PS(XMM2,XMM7)
4469         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4470         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4471         SSE_ADD_PS(XMM0,XMM2)
4472 
4473         /* Fourth Column */
4474         SSE_COPY_PS(XMM3,XMM7)
4475         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4476         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4477         SSE_ADD_PS(XMM0,XMM3)
4478 
4479         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4480       SSE_INLINE_END_3
4481 
4482       v    = aa + ai16 + 16;
4483       idt -= 4;
4484     }
4485 
4486     /* Convert t from single precision back to double precision (inplace)*/
       /* Walk from the last entry to the first so that each wider store lands only */
       /* on single precision values that have already been promoted.               */
4487     idt = 4*(n-1);
4488     for (i=n-1;i>=0;i--) {
4489       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4490       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4491       PetscScalar *xtemp=&x[idt];
4492       MatScalar   *ttemp=&t[idt];
4493       xtemp[3] = (PetscScalar)ttemp[3];
4494       xtemp[2] = (PetscScalar)ttemp[2];
4495       xtemp[1] = (PetscScalar)ttemp[1];
4496       xtemp[0] = (PetscScalar)ttemp[0];
4497       idt -= 4;
4498     }
4499 
4500   } /* End of artificial scope. */
4501   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4502   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4503   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4504   SSE_SCOPE_END;
4505   PetscFunctionReturn(0);
4506 }
4507 
4508 #undef __FUNCT__
4509 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - Forward/backward triangular
   solve with a factored SeqBAIJ matrix using 16 scalars (one 4x4 block) per
   stored entry, carried out with SSE single-precision arithmetic.

   Input:   A  - factored matrix (a->i row starts, a->j block columns,
                 a->diag position of each diagonal block, a->a values)
            bb - right-hand side
   Output:  xx - solution

   Outline of what the body does:
     1. The storage of xx is reinterpreted as a MatScalar array t and the
        right-hand side is demoted into it four entries at a time.
     2. Forward sweep over rows 1..n-1 subtracting block-times-vector products
        of the entries located before diag[i].
     3. Backward sweep over rows n-1..0 subtracting the products of the entries
        after diag[i], then multiplying by the diagonal block.
     4. t is converted back into xx in place, last block first.

   No row or column index set is applied anywhere in this routine.
   NOTE(review): presumably MatScalar is single precision (float) whenever this
   routine is compiled, otherwise aliasing t onto x would not work - confirm.
   NOTE(review): the index arrays are accessed through plain int pointers while
   the non-SSE solves in this file use PetscInt - confirm PetscInt is int here.
*/
4510 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4511 {
4512   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4513   int            *aj=a->j;
4514   PetscErrorCode ierr;
4515   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4516   MatScalar      *aa=a->a;
4517   PetscScalar    *x,*b;
4518 
4519   PetscFunctionBegin;
4520   SSE_SCOPE_BEGIN;
4521   /*
4522      Note: This code currently uses demotion of double
4523      to float when performing the mixed-mode computation.
4524      This may not be numerically reasonable for all applications.
4525   */
       /* Start fetching the values of block row 1 (the first row the forward sweep touches). */
4526   PREFETCH_NTA(aa+16*ai[1]);
4527 
4528   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4529   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4530   {
4531     /* x will first be computed in single precision then promoted inplace to double */
4532     MatScalar *v,*t=(MatScalar *)x;
4533     int       nz,i,idt,ai16;
4534     int       jdx,idx;
4535     int       *vi;
4536     /* Forward solve the lower triangular factor. */
4537 
4538     /* First block is the identity. */
4539     idx  = 0;
4540     CONVERT_DOUBLE4_FLOAT4(t,b);
4541     v    =  aa + 16*ai[1];
4542 
         /* The loop counter is advanced inside the body (see ai[++i] below). */
4543     for (i=1; i<n;) {
4544       PREFETCH_NTA(&v[8]);
4545       vi   =  aj      + ai[i];
4546       nz   =  diag[i] - ai[i];
4547       idx +=  4;
4548 
4549       /* Demote RHS from double to float. */
4550       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
           /* XMM7 accumulates the four entries of this block row until STORE_PS below. */
4551       LOAD_PS(&t[idx],XMM7);
4552 
4553       while (nz--) {
4554         PREFETCH_NTA(&v[16]);
4555         jdx = 4*(*vi++);
4556 /*          jdx = *vi++; */
4557 
4558         /* 4x4 Matrix-Vector product with negative accumulation: */
4559         SSE_INLINE_BEGIN_2(&t[jdx],v)
4560           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4561 
4562           /* First Column */
4563           SSE_COPY_PS(XMM0,XMM6)
4564           SSE_SHUFFLE(XMM0,XMM0,0x00)
4565           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4566           SSE_SUB_PS(XMM7,XMM0)
4567 
4568           /* Second Column */
4569           SSE_COPY_PS(XMM1,XMM6)
4570           SSE_SHUFFLE(XMM1,XMM1,0x55)
4571           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4572           SSE_SUB_PS(XMM7,XMM1)
4573 
4574           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4575 
4576           /* Third Column */
4577           SSE_COPY_PS(XMM2,XMM6)
4578           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4579           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4580           SSE_SUB_PS(XMM7,XMM2)
4581 
4582           /* Fourth Column */
4583           SSE_COPY_PS(XMM3,XMM6)
4584           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4585           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4586           SSE_SUB_PS(XMM7,XMM3)
4587         SSE_INLINE_END_2
4588 
4589         v  += 16;
4590       }
           /* Advance to the next block row; on the last pass this reads ai[n]. */
4591       v    =  aa + 16*ai[++i];
4592       PREFETCH_NTA(v);
4593       STORE_PS(&t[idx],XMM7);
4594     }
4595 
4596     /* Backward solve the upper triangular factor.*/
4597 
4598     idt  = 4*(n-1);
4599     ai16 = 16*diag[n-1];
4600     v    = aa + ai16 + 16;
         /* The loop counter is decremented inside the body (see diag[--i] below). */
4601     for (i=n-1; i>=0;){
4602       PREFETCH_NTA(&v[8]);
4603       vi = aj + diag[i] + 1;
4604       nz = ai[i+1] - diag[i] - 1;
4605 
4606       LOAD_PS(&t[idt],XMM7);
4607 
4608       while (nz--) {
4609         PREFETCH_NTA(&v[16]);
4610         idx = 4*(*vi++);
4611 /*          idx = *vi++; */
4612 
4613         /* 4x4 Matrix-Vector Product with negative accumulation: */
4614         SSE_INLINE_BEGIN_2(&t[idx],v)
4615           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4616 
4617           /* First Column */
4618           SSE_COPY_PS(XMM0,XMM6)
4619           SSE_SHUFFLE(XMM0,XMM0,0x00)
4620           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4621           SSE_SUB_PS(XMM7,XMM0)
4622 
4623           /* Second Column */
4624           SSE_COPY_PS(XMM1,XMM6)
4625           SSE_SHUFFLE(XMM1,XMM1,0x55)
4626           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4627           SSE_SUB_PS(XMM7,XMM1)
4628 
4629           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4630 
4631           /* Third Column */
4632           SSE_COPY_PS(XMM2,XMM6)
4633           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4634           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4635           SSE_SUB_PS(XMM7,XMM2)
4636 
4637           /* Fourth Column */
4638           SSE_COPY_PS(XMM3,XMM6)
4639           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4640           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4641           SSE_SUB_PS(XMM7,XMM3)
4642         SSE_INLINE_END_2
4643         v  += 16;
4644       }
           /* v now addresses the diagonal block of the current row (ai16 still refers to it). */
4645       v    = aa + ai16;
           /* NOTE(review): when i is 0 on entry, --i yields -1 and diag[-1] is read here.
              The value is only used for the prefetch address and the next v, which is
              never dereferenced once the loop ends - but the read itself is out of
              bounds; confirm and guard if needed. */
4646       ai16 = 16*diag[--i];
4647       PREFETCH_NTA(aa+ai16+16);
4648       /*
4649          Scale the result by the diagonal 4x4 block,
4650          which was inverted as part of the factorization
4651       */
4652       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4653         /* First Column */
4654         SSE_COPY_PS(XMM0,XMM7)
4655         SSE_SHUFFLE(XMM0,XMM0,0x00)
4656         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4657 
4658         /* Second Column */
4659         SSE_COPY_PS(XMM1,XMM7)
4660         SSE_SHUFFLE(XMM1,XMM1,0x55)
4661         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4662         SSE_ADD_PS(XMM0,XMM1)
4663 
4664         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4665 
4666         /* Third Column */
4667         SSE_COPY_PS(XMM2,XMM7)
4668         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4669         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4670         SSE_ADD_PS(XMM0,XMM2)
4671 
4672         /* Fourth Column */
4673         SSE_COPY_PS(XMM3,XMM7)
4674         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4675         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4676         SSE_ADD_PS(XMM0,XMM3)
4677 
4678         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4679       SSE_INLINE_END_3
4680 
4681       v    = aa + ai16 + 16;
4682       idt -= 4;
4683     }
4684 
4685     /* Convert t from single precision back to double precision (inplace)*/
         /* t and x share storage, so blocks and the entries inside each block are
            converted from the highest index down to the lowest. */
4686     idt = 4*(n-1);
4687     for (i=n-1;i>=0;i--) {
4688       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4689       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4690       PetscScalar *xtemp=&x[idt];
4691       MatScalar   *ttemp=&t[idt];
4692       xtemp[3] = (PetscScalar)ttemp[3];
4693       xtemp[2] = (PetscScalar)ttemp[2];
4694       xtemp[1] = (PetscScalar)ttemp[1];
4695       xtemp[0] = (PetscScalar)ttemp[0];
4696       idt -= 4;
4697     }
4698 
4699   } /* End of artificial scope. */
4700   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4701   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4702   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4703   SSE_SCOPE_END;
4704   PetscFunctionReturn(0);
4705 }
4706 
4707 #endif
4708 
4709 #undef __FUNCT__
4710 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4711 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4712 {
4713   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4714   IS                iscol=a->col,isrow=a->row;
4715   PetscErrorCode    ierr;
4716   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4717   PetscInt          i,nz,idx,idt,idc;
4718   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4719   const MatScalar   *aa=a->a,*v;
4720   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4721   const PetscScalar *b;
4722 
4723   PetscFunctionBegin;
4724   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4725   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4726   t  = a->solve_work;
4727 
4728   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4729   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4730 
4731   /* forward solve the lower triangular */
4732   idx    = 3*(*r++);
4733   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4734   for (i=1; i<n; i++) {
4735     v     = aa + 9*ai[i];
4736     vi    = aj + ai[i];
4737     nz    = diag[i] - ai[i];
4738     idx   = 3*(*r++);
4739     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4740     while (nz--) {
4741       idx   = 3*(*vi++);
4742       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4743       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4744       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4745       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4746       v += 9;
4747     }
4748     idx = 3*i;
4749     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4750   }
4751   /* backward solve the upper triangular */
4752   for (i=n-1; i>=0; i--){
4753     v    = aa + 9*diag[i] + 9;
4754     vi   = aj + diag[i] + 1;
4755     nz   = ai[i+1] - diag[i] - 1;
4756     idt  = 3*i;
4757     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4758     while (nz--) {
4759       idx   = 3*(*vi++);
4760       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4761       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4762       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4763       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4764       v += 9;
4765     }
4766     idc = 3*(*c--);
4767     v   = aa + 9*diag[i];
4768     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4769     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4770     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4771   }
4772   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4773   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4774   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4775   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4776   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4777   PetscFunctionReturn(0);
4778 }
4779 
4780 #undef __FUNCT__
4781 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4782 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4783 {
4784   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4785   IS                iscol=a->col,isrow=a->row;
4786   PetscErrorCode    ierr;
4787   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4788   PetscInt          i,nz,idx,idt,idc,m;
4789   const PetscInt    *r,*c,*rout,*cout;
4790   const MatScalar   *aa=a->a,*v;
4791   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4792   const PetscScalar *b;
4793 
4794   PetscFunctionBegin;
4795   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4796   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4797   t  = a->solve_work;
4798 
4799   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4800   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4801 
4802   /* forward solve the lower triangular */
4803   idx    = 3*r[0];
4804   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4805   for (i=1; i<n; i++) {
4806     v     = aa + 9*ai[i];
4807     vi    = aj + ai[i];
4808     nz    = ai[i+1] - ai[i];
4809     idx   = 3*r[i];
4810     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4811     for(m=0;m<nz;m++){
4812       idx   = 3*vi[m];
4813       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4814       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4815       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4816       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4817       v += 9;
4818     }
4819     idx = 3*i;
4820     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4821   }
4822   /* backward solve the upper triangular */
4823   for (i=n-1; i>=0; i--){
4824     v    = aa + 9*(adiag[i+1]+1);
4825     vi   = aj + adiag[i+1]+1;
4826     nz   = adiag[i] - adiag[i+1] - 1;
4827     idt  = 3*i;
4828     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4829     for(m=0;m<nz;m++){
4830       idx   = 3*vi[m];
4831       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4832       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4833       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4834       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4835       v += 9;
4836     }
4837     idc = 3*c[i];
4838     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4839     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4840     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4841   }
4842   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4843   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4844   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4845   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4846   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4847   PetscFunctionReturn(0);
4848 }
4849 
4850 /*
4851       Special case where the matrix was ILU(0) factored in the natural
4852    ordering. This eliminates the need for the column and row permutation.
4853 */
4854 #undef __FUNCT__
4855 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4856 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4857 {
4858   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4859   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4860   PetscErrorCode    ierr;
4861   const PetscInt    *diag = a->diag,*vi;
4862   const MatScalar   *aa=a->a,*v;
4863   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4864   const PetscScalar *b;
4865   PetscInt          jdx,idt,idx,nz,i;
4866 
4867   PetscFunctionBegin;
4868   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4869   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4870 
4871   /* forward solve the lower triangular */
4872   idx    = 0;
4873   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4874   for (i=1; i<n; i++) {
4875     v     =  aa      + 9*ai[i];
4876     vi    =  aj      + ai[i];
4877     nz    =  diag[i] - ai[i];
4878     idx   +=  3;
4879     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4880     while (nz--) {
4881       jdx   = 3*(*vi++);
4882       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4883       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4884       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4885       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4886       v    += 9;
4887     }
4888     x[idx]   = s1;
4889     x[1+idx] = s2;
4890     x[2+idx] = s3;
4891   }
4892   /* backward solve the upper triangular */
4893   for (i=n-1; i>=0; i--){
4894     v    = aa + 9*diag[i] + 9;
4895     vi   = aj + diag[i] + 1;
4896     nz   = ai[i+1] - diag[i] - 1;
4897     idt  = 3*i;
4898     s1 = x[idt];  s2 = x[1+idt];
4899     s3 = x[2+idt];
4900     while (nz--) {
4901       idx   = 3*(*vi++);
4902       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4903       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4904       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4905       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4906       v    += 9;
4907     }
4908     v        = aa +  9*diag[i];
4909     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4910     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4911     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4912   }
4913 
4914   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4915   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4916   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4917   PetscFunctionReturn(0);
4918 }
4919 
4920 #undef __FUNCT__
4921 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4922 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4923 {
4924     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4925     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4926     PetscErrorCode    ierr;
4927     PetscInt          i,k,nz,idx,jdx,idt;
4928     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4929     const MatScalar   *aa=a->a,*v;
4930     PetscScalar       *x;
4931     const PetscScalar *b;
4932     PetscScalar        s1,s2,s3,x1,x2,x3;
4933 
4934     PetscFunctionBegin;
4935     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4936     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4937     /* forward solve the lower triangular */
4938     idx    = 0;
4939     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4940     for (i=1; i<n; i++) {
4941        v    = aa + bs2*ai[i];
4942        vi   = aj + ai[i];
4943        nz   = ai[i+1] - ai[i];
4944       idx   = bs*i;
4945        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4946       for(k=0;k<nz;k++){
4947          jdx   = bs*vi[k];
4948           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4949           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4950           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4951           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4952 
4953           v   +=  bs2;
4954         }
4955 
4956        x[idx]   = s1;
4957        x[1+idx] = s2;
4958        x[2+idx] = s3;
4959     }
4960 
4961    /* backward solve the upper triangular */
4962   for (i=n-1; i>=0; i--){
4963     v   = aa + bs2*(adiag[i+1]+1);
4964      vi  = aj + adiag[i+1]+1;
4965      nz  = adiag[i] - adiag[i+1]-1;
4966      idt = bs*i;
4967      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4968 
4969      for(k=0;k<nz;k++){
4970        idx   = bs*vi[k];
4971        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4972        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4973        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4974        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4975 
4976         v   +=  bs2;
4977     }
4978     /* x = inv_diagonal*x */
4979    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4980    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4981    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4982 
4983   }
4984 
4985   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4986   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4987   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4988   PetscFunctionReturn(0);
4989 }
4990 
4991 #undef __FUNCT__
4992 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
4993 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
4994 {
4995   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4996   IS                iscol=a->col,isrow=a->row;
4997   PetscErrorCode    ierr;
4998   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4999   PetscInt          i,nz,idx,idt,idc;
5000   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5001   const MatScalar   *aa=a->a,*v;
5002   PetscScalar       *x,s1,s2,x1,x2,*t;
5003   const PetscScalar *b;
5004 
5005   PetscFunctionBegin;
5006   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5007   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5008   t  = a->solve_work;
5009 
5010   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5011   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5012 
5013   /* forward solve the lower triangular */
5014   idx    = 2*(*r++);
5015   t[0] = b[idx]; t[1] = b[1+idx];
5016   for (i=1; i<n; i++) {
5017     v     = aa + 4*ai[i];
5018     vi    = aj + ai[i];
5019     nz    = diag[i] - ai[i];
5020     idx   = 2*(*r++);
5021     s1  = b[idx]; s2 = b[1+idx];
5022     while (nz--) {
5023       idx   = 2*(*vi++);
5024       x1    = t[idx]; x2 = t[1+idx];
5025       s1 -= v[0]*x1 + v[2]*x2;
5026       s2 -= v[1]*x1 + v[3]*x2;
5027       v += 4;
5028     }
5029     idx = 2*i;
5030     t[idx] = s1; t[1+idx] = s2;
5031   }
5032   /* backward solve the upper triangular */
5033   for (i=n-1; i>=0; i--){
5034     v    = aa + 4*diag[i] + 4;
5035     vi   = aj + diag[i] + 1;
5036     nz   = ai[i+1] - diag[i] - 1;
5037     idt  = 2*i;
5038     s1 = t[idt]; s2 = t[1+idt];
5039     while (nz--) {
5040       idx   = 2*(*vi++);
5041       x1    = t[idx]; x2 = t[1+idx];
5042       s1 -= v[0]*x1 + v[2]*x2;
5043       s2 -= v[1]*x1 + v[3]*x2;
5044       v += 4;
5045     }
5046     idc = 2*(*c--);
5047     v   = aa + 4*diag[i];
5048     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5049     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5050   }
5051   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5052   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5053   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5054   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5055   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5056   PetscFunctionReturn(0);
5057 }
5058 
5059 #undef __FUNCT__
5060 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
5061 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5062 {
5063   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5064   IS                iscol=a->col,isrow=a->row;
5065   PetscErrorCode    ierr;
5066   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5067   PetscInt          i,nz,idx,jdx,idt,idc,m;
5068   const PetscInt    *r,*c,*rout,*cout;
5069   const MatScalar   *aa=a->a,*v;
5070   PetscScalar       *x,s1,s2,x1,x2,*t;
5071   const PetscScalar *b;
5072 
5073   PetscFunctionBegin;
5074   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5075   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5076   t  = a->solve_work;
5077 
5078   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5079   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5080 
5081   /* forward solve the lower triangular */
5082   idx    = 2*r[0];
5083   t[0] = b[idx]; t[1] = b[1+idx];
5084   for (i=1; i<n; i++) {
5085     v     = aa + 4*ai[i];
5086     vi    = aj + ai[i];
5087     nz    = ai[i+1] - ai[i];
5088     idx   = 2*r[i];
5089     s1  = b[idx]; s2 = b[1+idx];
5090     for(m=0;m<nz;m++){
5091       jdx   = 2*vi[m];
5092       x1    = t[jdx]; x2 = t[1+jdx];
5093       s1 -= v[0]*x1 + v[2]*x2;
5094       s2 -= v[1]*x1 + v[3]*x2;
5095       v += 4;
5096     }
5097     idx = 2*i;
5098     t[idx] = s1; t[1+idx] = s2;
5099   }
5100   /* backward solve the upper triangular */
5101   for (i=n-1; i>=0; i--){
5102     v    = aa + 4*(adiag[i+1]+1);
5103     vi   = aj + adiag[i+1]+1;
5104     nz   = adiag[i] - adiag[i+1] - 1;
5105     idt  = 2*i;
5106     s1 = t[idt]; s2 = t[1+idt];
5107     for(m=0;m<nz;m++){
5108       idx   = 2*vi[m];
5109       x1    = t[idx]; x2 = t[1+idx];
5110       s1 -= v[0]*x1 + v[2]*x2;
5111       s2 -= v[1]*x1 + v[3]*x2;
5112       v += 4;
5113     }
5114     idc = 2*c[i];
5115     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5116     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5117   }
5118   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5119   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5120   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5121   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5122   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5123   PetscFunctionReturn(0);
5124 }
5125 
5126 /*
5127       Special case where the matrix was ILU(0) factored in the natural
5128    ordering. This eliminates the need for the column and row permutation.
5129 */
5130 #undef __FUNCT__
5131 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5132 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5133 {
5134   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5135   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5136   PetscErrorCode    ierr;
5137   const MatScalar   *aa=a->a,*v;
5138   PetscScalar       *x,s1,s2,x1,x2;
5139   const PetscScalar *b;
5140   PetscInt          jdx,idt,idx,nz,i;
5141 
5142   PetscFunctionBegin;
5143   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5144   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5145 
5146   /* forward solve the lower triangular */
5147   idx    = 0;
5148   x[0]   = b[0]; x[1] = b[1];
5149   for (i=1; i<n; i++) {
5150     v     =  aa      + 4*ai[i];
5151     vi    =  aj      + ai[i];
5152     nz    =  diag[i] - ai[i];
5153     idx   +=  2;
5154     s1  =  b[idx];s2 = b[1+idx];
5155     while (nz--) {
5156       jdx   = 2*(*vi++);
5157       x1    = x[jdx];x2 = x[1+jdx];
5158       s1 -= v[0]*x1 + v[2]*x2;
5159       s2 -= v[1]*x1 + v[3]*x2;
5160       v    += 4;
5161     }
5162     x[idx]   = s1;
5163     x[1+idx] = s2;
5164   }
5165   /* backward solve the upper triangular */
5166   for (i=n-1; i>=0; i--){
5167     v    = aa + 4*diag[i] + 4;
5168     vi   = aj + diag[i] + 1;
5169     nz   = ai[i+1] - diag[i] - 1;
5170     idt  = 2*i;
5171     s1 = x[idt];  s2 = x[1+idt];
5172     while (nz--) {
5173       idx   = 2*(*vi++);
5174       x1    = x[idx];   x2 = x[1+idx];
5175       s1 -= v[0]*x1 + v[2]*x2;
5176       s2 -= v[1]*x1 + v[3]*x2;
5177       v    += 4;
5178     }
5179     v        = aa +  4*diag[i];
5180     x[idt]   = v[0]*s1 + v[2]*s2;
5181     x[1+idt] = v[1]*s1 + v[3]*s2;
5182   }
5183 
5184   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5185   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5186   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5187   PetscFunctionReturn(0);
5188 }
5189 
5190 #undef __FUNCT__
5191 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5192 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5193 {
5194     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5195     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5196     PetscInt          i,k,nz,idx,idt,jdx;
5197     PetscErrorCode    ierr;
5198     const MatScalar   *aa=a->a,*v;
5199     PetscScalar       *x,s1,s2,x1,x2;
5200     const PetscScalar *b;
5201 
5202     PetscFunctionBegin;
5203     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5204     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5205     /* forward solve the lower triangular */
5206     idx    = 0;
5207     x[0] = b[idx]; x[1] = b[1+idx];
5208     for (i=1; i<n; i++) {
5209         v   = aa + 4*ai[i];
5210        vi   = aj + ai[i];
5211        nz   = ai[i+1] - ai[i];
5212        idx  = 2*i;
5213        s1   = b[idx];s2 = b[1+idx];
5214       for(k=0;k<nz;k++){
5215          jdx   = 2*vi[k];
5216           x1    = x[jdx];x2 = x[1+jdx];
5217           s1   -= v[0]*x1 + v[2]*x2;
5218           s2   -= v[1]*x1 + v[3]*x2;
5219            v   +=  4;
5220         }
5221        x[idx]   = s1;
5222        x[1+idx] = s2;
5223     }
5224 
5225    /* backward solve the upper triangular */
5226   for (i=n-1; i>=0; i--){
5227      v   = aa + 4*(adiag[i+1]+1);
5228      vi  = aj + adiag[i+1]+1;
5229      nz  = adiag[i] - adiag[i+1]-1;
5230      idt = 2*i;
5231      s1 = x[idt];  s2 = x[1+idt];
5232      for(k=0;k<nz;k++){
5233       idx   = 2*vi[k];
5234        x1    = x[idx];   x2 = x[1+idx];
5235        s1 -= v[0]*x1 + v[2]*x2;
5236        s2 -= v[1]*x1 + v[3]*x2;
5237          v    += 4;
5238     }
5239     /* x = inv_diagonal*x */
5240    x[idt]   = v[0]*s1 + v[2]*s2;
5241    x[1+idt] = v[1]*s1 + v[3]*s2;
5242   }
5243 
5244   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5245   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5246   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5247   PetscFunctionReturn(0);
5248 }
5249 
5250 #undef __FUNCT__
5251 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5252 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5253 {
5254   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5255   IS                iscol=a->col,isrow=a->row;
5256   PetscErrorCode    ierr;
5257   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5258   PetscInt          i,nz;
5259   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5260   const MatScalar   *aa=a->a,*v;
5261   PetscScalar       *x,s1,*t;
5262   const PetscScalar *b;
5263 
5264   PetscFunctionBegin;
5265   if (!n) PetscFunctionReturn(0);
5266 
5267   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5268   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5269   t  = a->solve_work;
5270 
5271   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5272   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5273 
5274   /* forward solve the lower triangular */
5275   t[0] = b[*r++];
5276   for (i=1; i<n; i++) {
5277     v     = aa + ai[i];
5278     vi    = aj + ai[i];
5279     nz    = diag[i] - ai[i];
5280     s1  = b[*r++];
5281     while (nz--) {
5282       s1 -= (*v++)*t[*vi++];
5283     }
5284     t[i] = s1;
5285   }
5286   /* backward solve the upper triangular */
5287   for (i=n-1; i>=0; i--){
5288     v    = aa + diag[i] + 1;
5289     vi   = aj + diag[i] + 1;
5290     nz   = ai[i+1] - diag[i] - 1;
5291     s1 = t[i];
5292     while (nz--) {
5293       s1 -= (*v++)*t[*vi++];
5294     }
5295     x[*c--] = t[i] = aa[diag[i]]*s1;
5296   }
5297 
5298   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5299   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5300   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5301   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5302   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5303   PetscFunctionReturn(0);
5304 }
5305 /*
5306       Special case where the matrix was ILU(0) factored in the natural
5307    ordering. This eliminates the need for the column and row permutation.
5308 */
5309 #undef __FUNCT__
5310 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5311 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5312 {
5313   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5314   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5315   PetscErrorCode    ierr;
5316   const MatScalar   *aa=a->a,*v;
5317   PetscScalar       *x;
5318   const PetscScalar *b;
5319   PetscScalar       s1,x1;
5320   PetscInt          jdx,idt,idx,nz,i;
5321 
5322   PetscFunctionBegin;
5323   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5324   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5325 
5326   /* forward solve the lower triangular */
5327   idx    = 0;
5328   x[0]   = b[0];
5329   for (i=1; i<n; i++) {
5330     v     =  aa      + ai[i];
5331     vi    =  aj      + ai[i];
5332     nz    =  diag[i] - ai[i];
5333     idx   +=  1;
5334     s1  =  b[idx];
5335     while (nz--) {
5336       jdx   = *vi++;
5337       x1    = x[jdx];
5338       s1 -= v[0]*x1;
5339       v    += 1;
5340     }
5341     x[idx]   = s1;
5342   }
5343   /* backward solve the upper triangular */
5344   for (i=n-1; i>=0; i--){
5345     v    = aa + diag[i] + 1;
5346     vi   = aj + diag[i] + 1;
5347     nz   = ai[i+1] - diag[i] - 1;
5348     idt  = i;
5349     s1 = x[idt];
5350     while (nz--) {
5351       idx   = *vi++;
5352       x1    = x[idx];
5353       s1 -= v[0]*x1;
5354       v    += 1;
5355     }
5356     v        = aa +  diag[i];
5357     x[idt]   = v[0]*s1;
5358   }
5359   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5361   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5362   PetscFunctionReturn(0);
5363 }
5364 
5365 /* ----------------------------------------------------------------*/
5366 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5367 
5368 #undef __FUNCT__
5369 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5370 /*
5371    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5372 */
5373 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5374 {
5375   Mat             C=B;
5376   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5377   PetscErrorCode  ierr;
5378   PetscInt        i,j,k,ipvt[15];
5379   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5380   PetscInt        nz,nzL,row;
5381   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5382   const MatScalar *v,*aa=a->a;
5383   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5384   PetscInt        sol_ver;
5385 
5386   PetscFunctionBegin;
5387 
5388   ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
5389 
5390   /* generate work space needed by the factorization */
5391   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5392   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5393 
5394   for (i=0; i<n; i++){
5395     /* zero rtmp */
5396     /* L part */
5397     nz    = bi[i+1] - bi[i];
5398     bjtmp = bj + bi[i];
5399     for  (j=0; j<nz; j++){
5400       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5401     }
5402 
5403     /* U part */
5404     nz = bdiag[i] - bdiag[i+1];
5405     bjtmp = bj + bdiag[i+1]+1;
5406     for  (j=0; j<nz; j++){
5407       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5408     }
5409 
5410     /* load in initial (unfactored row) */
5411     nz    = ai[i+1] - ai[i];
5412     ajtmp = aj + ai[i];
5413     v     = aa + bs2*ai[i];
5414     for (j=0; j<nz; j++) {
5415       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5416     }
5417 
5418     /* elimination */
5419     bjtmp = bj + bi[i];
5420     nzL   = bi[i+1] - bi[i];
5421     for(k=0;k < nzL;k++) {
5422       row = bjtmp[k];
5423       pc = rtmp + bs2*row;
5424       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5425       if (flg) {
5426         pv = b->a + bs2*bdiag[row];
5427 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5428 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5429 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5430         pv = b->a + bs2*(bdiag[row+1]+1);
5431         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5432         for (j=0; j<nz; j++) {
5433           vv   = rtmp + bs2*pj[j];
5434           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5435 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5436 	  pv  += bs2;
5437         }
5438         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5439       }
5440     }
5441 
5442     /* finished row so stick it into b->a */
5443     /* L part */
5444     pv   = b->a + bs2*bi[i] ;
5445     pj   = b->j + bi[i] ;
5446     nz   = bi[i+1] - bi[i];
5447     for (j=0; j<nz; j++) {
5448       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5449     }
5450 
5451     /* Mark diagonal and invert diagonal for simplier triangular solves */
5452     pv   = b->a + bs2*bdiag[i];
5453     pj   = b->j + bdiag[i];
5454     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5455     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5456     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftinblocks);CHKERRQ(ierr);
5457 
5458     /* U part */
5459     pv = b->a + bs2*(bdiag[i+1]+1);
5460     pj = b->j + bdiag[i+1]+1;
5461     nz = bdiag[i] - bdiag[i+1] - 1;
5462     for (j=0; j<nz; j++){
5463       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5464     }
5465   }
5466 
5467   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5468   if(sol_ver == 1) C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering;
5469   else if (sol_ver == 2) C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver2;
5470   else C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5471   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5472   C->assembled = PETSC_TRUE;
5473   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5474   PetscFunctionReturn(0);
5475 }
5476 
5477 #undef __FUNCT__
5478 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5479 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5480 {
5481   Mat            C=B;
5482   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5483   IS             isrow = b->row,isicol = b->icol;
5484   PetscErrorCode ierr;
5485   const PetscInt *r,*ic,*ics;
5486   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5487   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5488   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5489   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5490   MatScalar      *v_work;
5491   PetscTruth     col_identity,row_identity,both_identity;
5492 
5493   PetscFunctionBegin;
5494   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5495   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5496 
5497   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5498   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5499   ics  = ic;
5500 
5501   /* generate work space needed by dense LU factorization */
5502   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5503 
5504   for (i=0; i<n; i++){
5505     /* zero rtmp */
5506     /* L part */
5507     nz    = bi[i+1] - bi[i];
5508     bjtmp = bj + bi[i];
5509     for  (j=0; j<nz; j++){
5510       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5511     }
5512 
5513     /* U part */
5514     nz = bdiag[i] - bdiag[i+1];
5515     bjtmp = bj + bdiag[i+1]+1;
5516     for  (j=0; j<nz; j++){
5517       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5518     }
5519 
5520     /* load in initial (unfactored row) */
5521     nz    = ai[r[i]+1] - ai[r[i]];
5522     ajtmp = aj + ai[r[i]];
5523     v     = aa + bs2*ai[r[i]];
5524     for (j=0; j<nz; j++) {
5525       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5526     }
5527 
5528     /* elimination */
5529     bjtmp = bj + bi[i];
5530     nzL   = bi[i+1] - bi[i];
5531     for(k=0;k < nzL;k++) {
5532       row = bjtmp[k];
5533       pc = rtmp + bs2*row;
5534       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5535       if (flg) {
5536         pv         = b->a + bs2*bdiag[row];
5537         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5538         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5539         pv         = b->a + bs2*(bdiag[row+1]+1);
5540         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5541         for (j=0; j<nz; j++) {
5542           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5543         }
5544         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5545       }
5546     }
5547 
5548     /* finished row so stick it into b->a */
5549     /* L part */
5550     pv   = b->a + bs2*bi[i] ;
5551     pj   = b->j + bi[i] ;
5552     nz   = bi[i+1] - bi[i];
5553     for (j=0; j<nz; j++) {
5554       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5555     }
5556 
5557     /* Mark diagonal and invert diagonal for simplier triangular solves */
5558     pv  = b->a + bs2*bdiag[i];
5559     pj  = b->j + bdiag[i];
5560     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5561     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5562     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5563 
5564     /* U part */
5565     pv = b->a + bs2*(bdiag[i+1]+1);
5566     pj = b->j + bdiag[i+1]+1;
5567     nz = bdiag[i] - bdiag[i+1] - 1;
5568     for (j=0; j<nz; j++){
5569       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5570     }
5571   }
5572 
5573   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5574   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5575   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5576   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5577 
5578   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5579   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5580   both_identity = (PetscTruth) (row_identity && col_identity);
5581   if (both_identity){
5582     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5583   } else {
5584     C->ops->solve = MatSolve_SeqBAIJ_N;
5585   }
5586   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5587 
5588   C->assembled = PETSC_TRUE;
5589   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5590   PetscFunctionReturn(0);
5591 }
5592 
5593 /*
5594    ilu(0) with natural ordering under new data structure.
5595    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5596    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5597 */
5598 
5599 #undef __FUNCT__
5600 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5601 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5602 {
5603 
5604   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5605   PetscErrorCode     ierr;
5606   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5607   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5608 
5609   PetscFunctionBegin;
5610   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5611   b    = (Mat_SeqBAIJ*)(fact)->data;
5612 
5613   /* allocate matrix arrays for new data structure */
5614   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5615   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5616   b->singlemalloc = PETSC_TRUE;
5617   if (!b->diag){
5618     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5619     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5620   }
5621   bdiag = b->diag;
5622 
5623   if (n > 0) {
5624     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5625   }
5626 
5627   /* set bi and bj with new data structure */
5628   bi = b->i;
5629   bj = b->j;
5630 
5631   /* L part */
5632   bi[0] = 0;
5633   for (i=0; i<n; i++){
5634     nz = adiag[i] - ai[i];
5635     bi[i+1] = bi[i] + nz;
5636     aj = a->j + ai[i];
5637     for (j=0; j<nz; j++){
5638       *bj = aj[j]; bj++;
5639     }
5640   }
5641 
5642   /* U part */
5643   bi_temp = bi[n];
5644   bdiag[n] = bi[n]-1;
5645   for (i=n-1; i>=0; i--){
5646     nz = ai[i+1] - adiag[i] - 1;
5647     bi_temp = bi_temp + nz + 1;
5648     aj = a->j + adiag[i] + 1;
5649     for (j=0; j<nz; j++){
5650       *bj = aj[j]; bj++;
5651     }
5652     /* diag[i] */
5653     *bj = i; bj++;
5654     bdiag[i] = bi_temp - 1;
5655   }
5656   PetscFunctionReturn(0);
5657 }
5658 
5659 #undef __FUNCT__
5660 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5661 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5662 {
5663   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5664   IS                 isicol;
5665   PetscErrorCode     ierr;
5666   const PetscInt     *r,*ic;
5667   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5668   PetscInt           *bi,*cols,nnz,*cols_lvl;
5669   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5670   PetscInt           i,levels,diagonal_fill;
5671   PetscTruth         col_identity,row_identity,both_identity;
5672   PetscReal          f;
5673   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5674   PetscBT            lnkbt;
5675   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5676   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5677   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5678   PetscTruth         missing;
5679   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5680 
5681   PetscFunctionBegin;
5682   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5683   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5684   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5685 
5686   f             = info->fill;
5687   levels        = (PetscInt)info->levels;
5688   diagonal_fill = (PetscInt)info->diagonal_fill;
5689   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5690 
5691   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5692   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5693   both_identity = (PetscTruth) (row_identity && col_identity);
5694 
5695   if (!levels && both_identity) {
5696     /* special case: ilu(0) with natural ordering */
5697     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5698     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5699 
5700     fact->factor = MAT_FACTOR_ILU;
5701     (fact)->info.factor_mallocs    = 0;
5702     (fact)->info.fill_ratio_given  = info->fill;
5703     (fact)->info.fill_ratio_needed = 1.0;
5704     b                = (Mat_SeqBAIJ*)(fact)->data;
5705     b->row           = isrow;
5706     b->col           = iscol;
5707     b->icol          = isicol;
5708     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5709     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5710     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5711     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5712     PetscFunctionReturn(0);
5713   }
5714 
5715   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5716   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5717 
5718   /* get new row pointers */
5719   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5720   bi[0] = 0;
5721   /* bdiag is location of diagonal in factor */
5722   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5723   bdiag[0]  = 0;
5724 
5725   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5726 
5727   /* create a linked list for storing column indices of the active row */
5728   nlnk = n + 1;
5729   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5730 
5731   /* initial FreeSpace size is f*(ai[n]+1) */
5732   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5733   current_space = free_space;
5734   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5735   current_space_lvl = free_space_lvl;
5736 
5737   for (i=0; i<n; i++) {
5738     nzi = 0;
5739     /* copy current row into linked list */
5740     nnz  = ai[r[i]+1] - ai[r[i]];
5741     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5742     cols = aj + ai[r[i]];
5743     lnk[i] = -1; /* marker to indicate if diagonal exists */
5744     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5745     nzi += nlnk;
5746 
5747     /* make sure diagonal entry is included */
5748     if (diagonal_fill && lnk[i] == -1) {
5749       fm = n;
5750       while (lnk[fm] < i) fm = lnk[fm];
5751       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5752       lnk[fm]    = i;
5753       lnk_lvl[i] = 0;
5754       nzi++; dcount++;
5755     }
5756 
5757     /* add pivot rows into the active row */
5758     nzbd = 0;
5759     prow = lnk[n];
5760     while (prow < i) {
5761       nnz      = bdiag[prow];
5762       cols     = bj_ptr[prow] + nnz + 1;
5763       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5764       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5765       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5766       nzi += nlnk;
5767       prow = lnk[prow];
5768       nzbd++;
5769     }
5770     bdiag[i] = nzbd;
5771     bi[i+1]  = bi[i] + nzi;
5772 
5773     /* if free space is not available, make more free space */
5774     if (current_space->local_remaining<nzi) {
5775       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5776       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5777       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5778       reallocs++;
5779     }
5780 
5781     /* copy data into free_space and free_space_lvl, then initialize lnk */
5782     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5783     bj_ptr[i]    = current_space->array;
5784     bjlvl_ptr[i] = current_space_lvl->array;
5785 
5786     /* make sure the active row i has diagonal entry */
5787     if (*(bj_ptr[i]+bdiag[i]) != i) {
5788       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5789     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5790     }
5791 
5792     current_space->array           += nzi;
5793     current_space->local_used      += nzi;
5794     current_space->local_remaining -= nzi;
5795     current_space_lvl->array           += nzi;
5796     current_space_lvl->local_used      += nzi;
5797     current_space_lvl->local_remaining -= nzi;
5798   }
5799 
5800   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5801   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5802 
5803   /* destroy list of free space and other temporary arrays */
5804   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5805 
5806   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5807   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5808 
5809   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5810   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5811   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5812 
5813 #if defined(PETSC_USE_INFO)
5814   {
5815     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5816     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5817     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5818     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5819     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5820     if (diagonal_fill) {
5821       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5822     }
5823   }
5824 #endif
5825 
5826   /* put together the new matrix */
5827   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5828   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5829   b = (Mat_SeqBAIJ*)(fact)->data;
5830   b->free_a       = PETSC_TRUE;
5831   b->free_ij      = PETSC_TRUE;
5832   b->singlemalloc = PETSC_FALSE;
5833   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5834   b->j          = bj;
5835   b->i          = bi;
5836   b->diag       = bdiag;
5837   b->free_diag  = PETSC_TRUE;
5838   b->ilen       = 0;
5839   b->imax       = 0;
5840   b->row        = isrow;
5841   b->col        = iscol;
5842   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5843   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5844   b->icol       = isicol;
5845   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5846   /* In b structure:  Free imax, ilen, old a, old j.
5847      Allocate bdiag, solve_work, new a, new j */
5848   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5849   b->maxnz = b->nz = bdiag[0]+1;
5850   fact->info.factor_mallocs    = reallocs;
5851   fact->info.fill_ratio_given  = f;
5852   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5853   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5854   PetscFunctionReturn(0);
5855 }
5856 
5857 
5858 /*
5859      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5860    except that the data structure of Mat_SeqAIJ is slightly different.
5861    Not a good example of code reuse.
5862 */
5863 #undef __FUNCT__
5864 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
5865 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5866 {
5867   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5868   IS             isicol;
5869   PetscErrorCode ierr;
5870   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5871   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5872   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5873   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5874   PetscTruth     col_identity,row_identity,both_identity,flg;
5875   PetscReal      f;
5876 
5877   PetscFunctionBegin;
5878   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5879   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5880 
5881   f             = info->fill;
5882   levels        = (PetscInt)info->levels;
5883   diagonal_fill = (PetscInt)info->diagonal_fill;
5884   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5885 
5886   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5887   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5888   both_identity = (PetscTruth) (row_identity && col_identity);
5889 
5890   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5891     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5892     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5893 
5894     fact->factor = MAT_FACTOR_ILU;
5895     b            = (Mat_SeqBAIJ*)fact->data;
5896     b->row       = isrow;
5897     b->col       = iscol;
5898     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5899     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5900     b->icol      = isicol;
5901     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5902     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5903     PetscFunctionReturn(0);
5904   }
5905 
5906   /* general case perform the symbolic factorization */
5907     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5908     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5909 
5910     /* get new row pointers */
5911     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5912     ainew[0] = 0;
5913     /* don't know how many column pointers are needed so estimate */
5914     jmax = (PetscInt)(f*ai[n] + 1);
5915     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5916     /* ajfill is level of fill for each fill entry */
5917     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5918     /* fill is a linked list of nonzeros in active row */
5919     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5920     /* im is level for each filled value */
5921     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5922     /* dloc is location of diagonal in factor */
5923     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5924     dloc[0]  = 0;
5925     for (prow=0; prow<n; prow++) {
5926 
5927       /* copy prow into linked list */
5928       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5929       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5930       xi         = aj + ai[r[prow]];
5931       fill[n]    = n;
5932       fill[prow] = -1; /* marker for diagonal entry */
5933       while (nz--) {
5934 	fm  = n;
5935 	idx = ic[*xi++];
5936 	do {
5937 	  m  = fm;
5938 	  fm = fill[m];
5939 	} while (fm < idx);
5940 	fill[m]   = idx;
5941 	fill[idx] = fm;
5942 	im[idx]   = 0;
5943       }
5944 
5945       /* make sure diagonal entry is included */
5946       if (diagonal_fill && fill[prow] == -1) {
5947 	fm = n;
5948 	while (fill[fm] < prow) fm = fill[fm];
5949 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5950 	fill[fm]   = prow;
5951 	im[prow]   = 0;
5952 	nzf++;
5953 	dcount++;
5954       }
5955 
5956       nzi = 0;
5957       row = fill[n];
5958       while (row < prow) {
5959 	incrlev = im[row] + 1;
5960 	nz      = dloc[row];
5961 	xi      = ajnew  + ainew[row] + nz + 1;
5962 	flev    = ajfill + ainew[row] + nz + 1;
5963 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5964 	fm      = row;
5965 	while (nnz-- > 0) {
5966 	  idx = *xi++;
5967 	  if (*flev + incrlev > levels) {
5968 	    flev++;
5969 	    continue;
5970 	  }
5971 	  do {
5972 	    m  = fm;
5973 	    fm = fill[m];
5974 	  } while (fm < idx);
5975 	  if (fm != idx) {
5976 	    im[idx]   = *flev + incrlev;
5977 	    fill[m]   = idx;
5978 	    fill[idx] = fm;
5979 	    fm        = idx;
5980 	    nzf++;
5981 	  } else {
5982 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5983 	  }
5984 	  flev++;
5985 	}
5986 	row = fill[row];
5987 	nzi++;
5988       }
5989       /* copy new filled row into permanent storage */
5990       ainew[prow+1] = ainew[prow] + nzf;
5991       if (ainew[prow+1] > jmax) {
5992 
5993 	/* estimate how much additional space we will need */
5994 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5995 	/* just double the memory each time */
5996 	PetscInt maxadd = jmax;
5997 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5998 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5999 	jmax += maxadd;
6000 
6001 	/* allocate a longer ajnew and ajfill */
6002 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6003 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6004 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
6005 	ajnew = xitmp;
6006 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6007 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6008 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
6009 	ajfill = xitmp;
6010 	reallocate++; /* count how many reallocations are needed */
6011       }
6012       xitmp       = ajnew + ainew[prow];
6013       flev        = ajfill + ainew[prow];
6014       dloc[prow]  = nzi;
6015       fm          = fill[n];
6016       while (nzf--) {
6017 	*xitmp++ = fm;
6018 	*flev++ = im[fm];
6019 	fm      = fill[fm];
6020       }
6021       /* make sure row has diagonal entry */
6022       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6023 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6024     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6025       }
6026     }
6027     ierr = PetscFree(ajfill);CHKERRQ(ierr);
6028     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6029     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6030     ierr = PetscFree(fill);CHKERRQ(ierr);
6031     ierr = PetscFree(im);CHKERRQ(ierr);
6032 
6033 #if defined(PETSC_USE_INFO)
6034     {
6035       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6036       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6037       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6038       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6039       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6040       if (diagonal_fill) {
6041 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6042       }
6043     }
6044 #endif
6045 
6046     /* put together the new matrix */
6047     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6048     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6049     b    = (Mat_SeqBAIJ*)fact->data;
6050     b->free_a       = PETSC_TRUE;
6051     b->free_ij      = PETSC_TRUE;
6052     b->singlemalloc = PETSC_FALSE;
6053     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6054     b->j          = ajnew;
6055     b->i          = ainew;
6056     for (i=0; i<n; i++) dloc[i] += ainew[i];
6057     b->diag       = dloc;
6058     b->free_diag  = PETSC_TRUE;
6059     b->ilen       = 0;
6060     b->imax       = 0;
6061     b->row        = isrow;
6062     b->col        = iscol;
6063     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6064     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6065     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6066     b->icol       = isicol;
6067     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6068     /* In b structure:  Free imax, ilen, old a, old j.
6069        Allocate dloc, solve_work, new a, new j */
6070     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
6071     b->maxnz          = b->nz = ainew[n];
6072 
6073     fact->info.factor_mallocs    = reallocate;
6074     fact->info.fill_ratio_given  = f;
6075     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6076 
6077   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6078   PetscFunctionReturn(0);
6079 }
6080 
6081 #undef __FUNCT__
6082 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6083 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6084 {
6085   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6086   /* int i,*AJ=a->j,nz=a->nz; */
6087   PetscFunctionBegin;
6088   /* Undo Column scaling */
6089 /*    while (nz--) { */
6090 /*      AJ[i] = AJ[i]/4; */
6091 /*    } */
6092   /* This should really invoke a push/pop logic, but we don't have that yet. */
6093   A->ops->setunfactored = PETSC_NULL;
6094   PetscFunctionReturn(0);
6095 }
6096 
6097 #undef __FUNCT__
6098 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6099 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6100 {
6101   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6102   PetscInt       *AJ=a->j,nz=a->nz;
6103   unsigned short *aj=(unsigned short *)AJ;
6104   PetscFunctionBegin;
6105   /* Is this really necessary? */
6106   while (nz--) {
6107     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6108   }
6109   A->ops->setunfactored = PETSC_NULL;
6110   PetscFunctionReturn(0);
6111 }
6112 
6113 
6114