xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 0b8f6341efea4801e15eba76161ba5410a8cc9ab)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "../src/mat/impls/baij/seq/baij.h"
8 #include "../src/mat/blockinvert.h"
9 #include "petscbt.h"
10 #include "../src/mat/utils/freespace.h"
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17   PetscErrorCode    ierr;
18   PetscInt          i,nz;
19   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
20   const MatScalar   *aa=a->a,*v;
21   PetscScalar       s1,*x;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode    ierr;
64   PetscInt          i,nz,idx,idt,oidx;
65   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
66   const MatScalar   *aa=a->a,*v;
67   PetscScalar       s1,s2,x1,x2,*x;
68   const PetscScalar *b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode    ierr;
123   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt          nz,idx,idt,j,i,oidx;
125   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
126   const MatScalar   *aa=a->a,*v;
127   PetscScalar       s1,s2,x1,x2,*x;
128   const PetscScalar *b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode    ierr;
182   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
183   PetscInt          i,nz,idx,idt,oidx;
184   const MatScalar   *aa=a->a,*v;
185   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
186   const PetscScalar *b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode    ierr;
244   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245   PetscInt          nz,idx,idt,j,i,oidx;
246   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
247   const MatScalar   *aa=a->a,*v;
248   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
249   const PetscScalar *b;
250 
251   PetscFunctionBegin;
252   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255 
256   /* forward solve the U^T */
257   idx = 0;
258   for (i=0; i<n; i++) {
259     v     = aa + bs2*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265     v -= bs2;
266 
267     vi    = aj + diag[i] - 1;
268     nz    = diag[i] - diag[i+1] - 1;
269     for(j=0;j>-nz;j--){
270       oidx = bs*vi[j];
271       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274       v  -= bs2;
275     }
276     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277     idx += bs;
278   }
279   /* backward solve the L^T */
280   for (i=n-1; i>=0; i--){
281     v    = aa + bs2*ai[i];
282     vi   = aj + ai[i];
283     nz   = ai[i+1] - ai[i];
284     idt  = bs*i;
285     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286     for(j=0;j<nz;j++){
287       idx   = bs*vi[j];
288       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291       v += bs2;
292     }
293   }
294   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode    ierr;
306   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
307   PetscInt          i,nz,idx,idt,oidx;
308   const MatScalar   *aa=a->a,*v;
309   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
310   const PetscScalar *b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 16*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328     v += 16;
329 
330     vi    = aj + diag[i] + 1;
331     nz    = ai[i+1] - diag[i] - 1;
332     while (nz--) {
333       oidx = 4*(*vi++);
334       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338       v  += 16;
339     }
340     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341     idx += 4;
342   }
343   /* backward solve the L^T */
344   for (i=n-1; i>=0; i--){
345     v    = aa + 16*diag[i] - 16;
346     vi   = aj + diag[i] - 1;
347     nz   = diag[i] - ai[i];
348     idt  = 4*i;
349     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350     while (nz--) {
351       idx   = 4*(*vi--);
352       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356       v -= 16;
357     }
358   }
359   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
370   PetscErrorCode    ierr;
371   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372   PetscInt          nz,idx,idt,j,i,oidx;
373   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
374   const MatScalar   *aa=a->a,*v;
375   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
376   const PetscScalar *b;
377 
378   PetscFunctionBegin;
379   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382 
383   /* forward solve the U^T */
384   idx = 0;
385   for (i=0; i<n; i++) {
386     v     = aa + bs2*diag[i];
387     /* multiply by the inverse of the block diagonal */
388     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393     v -= bs2;
394 
395     vi    = aj + diag[i] - 1;
396     nz    = diag[i] - diag[i+1] - 1;
397     for(j=0;j>-nz;j--){
398       oidx = bs*vi[j];
399       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403       v  -= bs2;
404     }
405     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406     idx += bs;
407   }
408   /* backward solve the L^T */
409   for (i=n-1; i>=0; i--){
410     v    = aa + bs2*ai[i];
411     vi   = aj + ai[i];
412     nz   = ai[i+1] - ai[i];
413     idt  = bs*i;
414     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415     for(j=0;j<nz;j++){
416       idx   = bs*vi[j];
417       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421       v += bs2;
422     }
423   }
424   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
435   PetscErrorCode    ierr;
436   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
437   PetscInt          i,nz,idx,idt,oidx;
438   const MatScalar   *aa=a->a,*v;
439   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
440   const PetscScalar *b;
441 
442   PetscFunctionBegin;
443   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446 
447   /* forward solve the U^T */
448   idx = 0;
449   for (i=0; i<n; i++) {
450 
451     v     = aa + 25*diag[i];
452     /* multiply by the inverse of the block diagonal */
453     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459     v += 25;
460 
461     vi    = aj + diag[i] + 1;
462     nz    = ai[i+1] - diag[i] - 1;
463     while (nz--) {
464       oidx = 5*(*vi++);
465       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470       v  += 25;
471     }
472     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473     idx += 5;
474   }
475   /* backward solve the L^T */
476   for (i=n-1; i>=0; i--){
477     v    = aa + 25*diag[i] - 25;
478     vi   = aj + diag[i] - 1;
479     nz   = diag[i] - ai[i];
480     idt  = 5*i;
481     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482     while (nz--) {
483       idx   = 5*(*vi--);
484       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489       v -= 25;
490     }
491   }
492   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
501 {
502   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503   PetscErrorCode ierr;
504   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505   PetscInt       nz,idx,idt,j,i,oidx;
506   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507   const MatScalar      *aa=a->a,*v;
508   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
509   const PetscScalar    *b;
510 
511   PetscFunctionBegin;
512   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515 
516   /* forward solve the U^T */
517   idx = 0;
518   for (i=0; i<n; i++) {
519     v     = aa + bs2*diag[i];
520     /* multiply by the inverse of the block diagonal */
521     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522     x5 = x[4+idx];
523     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528     v -= bs2;
529 
530     vi    = aj + diag[i] - 1;
531     nz    = diag[i] - diag[i+1] - 1;
532     for(j=0;j>-nz;j--){
533       oidx = bs*vi[j];
534       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539       v  -= bs2;
540     }
541     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542     idx += bs;
543   }
544   /* backward solve the L^T */
545   for (i=n-1; i>=0; i--){
546     v    = aa + bs2*ai[i];
547     vi   = aj + ai[i];
548     nz   = ai[i+1] - ai[i];
549     idt  = bs*i;
550     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551     for(j=0;j<nz;j++){
552       idx   = bs*vi[j];
553       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558       v += bs2;
559     }
560   }
561   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
562   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570 {
571   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
572   PetscErrorCode    ierr;
573   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
574   PetscInt          i,nz,idx,idt,oidx;
575   const MatScalar   *aa=a->a,*v;
576   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
577   const PetscScalar *b;
578 
579   PetscFunctionBegin;
580   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583 
584   /* forward solve the U^T */
585   idx = 0;
586   for (i=0; i<n; i++) {
587 
588     v     = aa + 36*diag[i];
589     /* multiply by the inverse of the block diagonal */
590     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591     x6    = x[5+idx];
592     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598     v += 36;
599 
600     vi    = aj + diag[i] + 1;
601     nz    = ai[i+1] - diag[i] - 1;
602     while (nz--) {
603       oidx = 6*(*vi++);
604       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610       v  += 36;
611     }
612     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613     x[5+idx] = s6;
614     idx += 6;
615   }
616   /* backward solve the L^T */
617   for (i=n-1; i>=0; i--){
618     v    = aa + 36*diag[i] - 36;
619     vi   = aj + diag[i] - 1;
620     nz   = diag[i] - ai[i];
621     idt  = 6*i;
622     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623     s6 = x[5+idt];
624     while (nz--) {
625       idx   = 6*(*vi--);
626       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v -= 36;
633     }
634   }
635   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #undef __FUNCT__
642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
644 {
645   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
646   PetscErrorCode    ierr;
647   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648   PetscInt          nz,idx,idt,j,i,oidx;
649   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
650   const MatScalar   *aa=a->a,*v;
651   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
652   const PetscScalar *b;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658 
659   /* forward solve the U^T */
660   idx = 0;
661   for (i=0; i<n; i++) {
662     v     = aa + bs2*diag[i];
663     /* multiply by the inverse of the block diagonal */
664     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665     x5 = x[4+idx]; x6 = x[5+idx];
666     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672     v -= bs2;
673 
674     vi    = aj + diag[i] - 1;
675     nz    = diag[i] - diag[i+1] - 1;
676     for(j=0;j>-nz;j--){
677       oidx = bs*vi[j];
678       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v  -= bs2;
685     }
686     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687     x[5+idx] = s6;
688     idx += bs;
689   }
690   /* backward solve the L^T */
691   for (i=n-1; i>=0; i--){
692     v    = aa + bs2*ai[i];
693     vi   = aj + ai[i];
694     nz   = ai[i+1] - ai[i];
695     idt  = bs*i;
696     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697     s6   = x[5+idt];
698     for(j=0;j<nz;j++){
699       idx   = bs*vi[j];
700       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706       v += bs2;
707     }
708   }
709   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718 {
719   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
720   PetscErrorCode    ierr;
721   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
722   PetscInt          i,nz,idx,idt,oidx;
723   const MatScalar   *aa=a->a,*v;
724   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
725   const PetscScalar *b;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 49*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739     x6    = x[5+idx]; x7 = x[6+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747     v += 49;
748 
749     vi    = aj + diag[i] + 1;
750     nz    = ai[i+1] - diag[i] - 1;
751     while (nz--) {
752       oidx = 7*(*vi++);
753       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760       v  += 49;
761     }
762     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763     x[5+idx] = s6;x[6+idx] = s7;
764     idx += 7;
765   }
766   /* backward solve the L^T */
767   for (i=n-1; i>=0; i--){
768     v    = aa + 49*diag[i] - 49;
769     vi   = aj + diag[i] - 1;
770     nz   = diag[i] - ai[i];
771     idt  = 7*i;
772     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773     s6 = x[5+idt];s7 = x[6+idt];
774     while (nz--) {
775       idx   = 7*(*vi--);
776       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783       v -= 49;
784     }
785   }
786   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
787   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
796   PetscErrorCode    ierr;
797   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798   PetscInt          nz,idx,idt,j,i,oidx;
799   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
800   const MatScalar   *aa=a->a,*v;
801   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
802   const PetscScalar *b;
803 
804   PetscFunctionBegin;
805   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808 
809   /* forward solve the U^T */
810   idx = 0;
811   for (i=0; i<n; i++) {
812     v     = aa + bs2*diag[i];
813     /* multiply by the inverse of the block diagonal */
814     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823     v -= bs2;
824     vi    = aj + diag[i] - 1;
825     nz    = diag[i] - diag[i+1] - 1;
826     for(j=0;j>-nz;j--){
827       oidx = bs*vi[j];
828       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v  -= bs2;
836     }
837     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838     x[5+idx] = s6;  x[6+idx] = s7;
839     idx += bs;
840   }
841   /* backward solve the L^T */
842   for (i=n-1; i>=0; i--){
843     v    = aa + bs2*ai[i];
844     vi   = aj + ai[i];
845     nz   = ai[i+1] - ai[i];
846     idt  = bs*i;
847     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848     s6   = x[5+idt];  s7 = x[6+idt];
849     for(j=0;j<nz;j++){
850       idx   = bs*vi[j];
851       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858       v += bs2;
859     }
860   }
861   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*---------------------------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
873   IS                iscol=a->col,isrow=a->row;
874   PetscErrorCode    ierr;
875   const PetscInt    *r,*c,*rout,*cout;
876   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
877   PetscInt          i,nz;
878   const MatScalar   *aa=a->a,*v;
879   PetscScalar       s1,*x,*t;
880   const PetscScalar *b;
881 
882   PetscFunctionBegin;
883   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
884   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
885   t  = a->solve_work;
886 
887   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
888   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
889 
890   /* copy the b into temp work space according to permutation */
891   for (i=0; i<n; i++) {
892     t[i] = b[c[i]];
893   }
894 
895   /* forward solve the U^T */
896   for (i=0; i<n; i++) {
897 
898     v     = aa + diag[i];
899     /* multiply by the inverse of the block diagonal */
900     s1    = (*v++)*t[i];
901     vi    = aj + diag[i] + 1;
902     nz    = ai[i+1] - diag[i] - 1;
903     while (nz--) {
904       t[*vi++]  -= (*v++)*s1;
905     }
906     t[i]   = s1;
907   }
908   /* backward solve the L^T */
909   for (i=n-1; i>=0; i--){
910     v    = aa + diag[i] - 1;
911     vi   = aj + diag[i] - 1;
912     nz   = diag[i] - ai[i];
913     s1   = t[i];
914     while (nz--) {
915       t[*vi--]   -=  (*v--)*s1;
916     }
917   }
918 
919   /* copy t into x according to permutation */
920   for (i=0; i<n; i++) {
921     x[r[i]]   = t[i];
922   }
923 
924   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
925   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
926   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
927   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
928   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
929   PetscFunctionReturn(0);
930 }
931 
932 #undef __FUNCT__
933 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
934 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
935 {
936   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
937   IS                iscol=a->col,isrow=a->row;
938   PetscErrorCode    ierr;
939   const PetscInt    *r,*c,*rout,*cout;
940   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
941   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
942   const MatScalar   *aa=a->a,*v;
943   PetscScalar       s1,s2,x1,x2,*x,*t;
944   const PetscScalar *b;
945 
946   PetscFunctionBegin;
947   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
948   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
949   t  = a->solve_work;
950 
951   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
952   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
953 
954   /* copy the b into temp work space according to permutation */
955   ii = 0;
956   for (i=0; i<n; i++) {
957     ic      = 2*c[i];
958     t[ii]   = b[ic];
959     t[ii+1] = b[ic+1];
960     ii += 2;
961   }
962 
963   /* forward solve the U^T */
964   idx = 0;
965   for (i=0; i<n; i++) {
966 
967     v     = aa + 4*diag[i];
968     /* multiply by the inverse of the block diagonal */
969     x1    = t[idx];   x2 = t[1+idx];
970     s1 = v[0]*x1  +  v[1]*x2;
971     s2 = v[2]*x1  +  v[3]*x2;
972     v += 4;
973 
974     vi    = aj + diag[i] + 1;
975     nz    = ai[i+1] - diag[i] - 1;
976     while (nz--) {
977       oidx = 2*(*vi++);
978       t[oidx]   -= v[0]*s1  +  v[1]*s2;
979       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
980       v  += 4;
981     }
982     t[idx]   = s1;t[1+idx] = s2;
983     idx += 2;
984   }
985   /* backward solve the L^T */
986   for (i=n-1; i>=0; i--){
987     v    = aa + 4*diag[i] - 4;
988     vi   = aj + diag[i] - 1;
989     nz   = diag[i] - ai[i];
990     idt  = 2*i;
991     s1 = t[idt];  s2 = t[1+idt];
992     while (nz--) {
993       idx   = 2*(*vi--);
994       t[idx]   -=  v[0]*s1 +  v[1]*s2;
995       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
996       v -= 4;
997     }
998   }
999 
1000   /* copy t into x according to permutation */
1001   ii = 0;
1002   for (i=0; i<n; i++) {
1003     ir      = 2*r[i];
1004     x[ir]   = t[ii];
1005     x[ir+1] = t[ii+1];
1006     ii += 2;
1007   }
1008 
1009   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1010   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1011   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1012   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1013   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNCT__
1018 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1019 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1020 {
1021   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1022   PetscErrorCode    ierr;
1023   IS                iscol=a->col,isrow=a->row;
1024   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1025   const PetscInt    *r,*c,*rout,*cout;
1026   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1027   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1028   const MatScalar   *aa=a->a,*v;
1029   PetscScalar       s1,s2,x1,x2,*x,*t;
1030   const PetscScalar *b;
1031 
1032   PetscFunctionBegin;
1033   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1034   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1035   t = a->solve_work;
1036 
1037   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1038   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1039 
1040   /* copy b into temp work space according to permutation */
1041   for(i=0;i<n;i++){
1042     ii = bs*i; ic = bs*c[i];
1043     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1044   }
1045 
1046   /* forward solve the U^T */
1047   idx = 0;
1048   for (i=0; i<n; i++) {
1049     v     = aa + bs2*diag[i];
1050     /* multiply by the inverse of the block diagonal */
1051     x1 = t[idx];   x2 = t[1+idx];
1052     s1 = v[0]*x1  +  v[1]*x2;
1053     s2 = v[2]*x1  +  v[3]*x2;
1054     v -= bs2;
1055 
1056     vi    = aj + diag[i] - 1;
1057     nz    = diag[i] - diag[i+1] - 1;
1058     for(j=0;j>-nz;j--){
1059       oidx = bs*vi[j];
1060       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1061       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1062       v  -= bs2;
1063     }
1064     t[idx]   = s1;t[1+idx] = s2;
1065     idx += bs;
1066   }
1067   /* backward solve the L^T */
1068   for (i=n-1; i>=0; i--){
1069     v    = aa + bs2*ai[i];
1070     vi   = aj + ai[i];
1071     nz   = ai[i+1] - ai[i];
1072     idt  = bs*i;
1073     s1   = t[idt];  s2 = t[1+idt];
1074     for(j=0;j<nz;j++){
1075       idx   = bs*vi[j];
1076       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1077       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1078       v += bs2;
1079     }
1080   }
1081 
1082   /* copy t into x according to permutation */
1083   for(i=0;i<n;i++){
1084     ii = bs*i;  ir = bs*r[i];
1085     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1086   }
1087 
1088   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1089   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1091   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1092   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 #undef __FUNCT__
1097 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1098 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1099 {
1100   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1101   IS                iscol=a->col,isrow=a->row;
1102   PetscErrorCode    ierr;
1103   const PetscInt    *r,*c,*rout,*cout;
1104   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1105   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1106   const MatScalar   *aa=a->a,*v;
1107   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1108   const PetscScalar *b;
1109 
1110   PetscFunctionBegin;
1111   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1112   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1113   t  = a->solve_work;
1114 
1115   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1116   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1117 
1118   /* copy the b into temp work space according to permutation */
1119   ii = 0;
1120   for (i=0; i<n; i++) {
1121     ic      = 3*c[i];
1122     t[ii]   = b[ic];
1123     t[ii+1] = b[ic+1];
1124     t[ii+2] = b[ic+2];
1125     ii += 3;
1126   }
1127 
1128   /* forward solve the U^T */
1129   idx = 0;
1130   for (i=0; i<n; i++) {
1131 
1132     v     = aa + 9*diag[i];
1133     /* multiply by the inverse of the block diagonal */
1134     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1135     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1136     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1137     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1138     v += 9;
1139 
1140     vi    = aj + diag[i] + 1;
1141     nz    = ai[i+1] - diag[i] - 1;
1142     while (nz--) {
1143       oidx = 3*(*vi++);
1144       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1145       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1146       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1147       v  += 9;
1148     }
1149     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1150     idx += 3;
1151   }
1152   /* backward solve the L^T */
1153   for (i=n-1; i>=0; i--){
1154     v    = aa + 9*diag[i] - 9;
1155     vi   = aj + diag[i] - 1;
1156     nz   = diag[i] - ai[i];
1157     idt  = 3*i;
1158     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1159     while (nz--) {
1160       idx   = 3*(*vi--);
1161       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1162       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1163       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1164       v -= 9;
1165     }
1166   }
1167 
1168   /* copy t into x according to permutation */
1169   ii = 0;
1170   for (i=0; i<n; i++) {
1171     ir      = 3*r[i];
1172     x[ir]   = t[ii];
1173     x[ir+1] = t[ii+1];
1174     x[ir+2] = t[ii+2];
1175     ii += 3;
1176   }
1177 
1178   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1179   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1180   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1181   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1182   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1183   PetscFunctionReturn(0);
1184 }
1185 
1186 #undef __FUNCT__
1187 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1188 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1189 {
1190   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1191   PetscErrorCode    ierr;
1192   IS                iscol=a->col,isrow=a->row;
1193   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1194   const PetscInt    *r,*c,*rout,*cout;
1195   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1196   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1197   const MatScalar   *aa=a->a,*v;
1198   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1199   const PetscScalar *b;
1200 
1201   PetscFunctionBegin;
1202   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204   t = a->solve_work;
1205 
1206   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1207   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1208 
1209   /* copy b into temp work space according to permutation */
1210   for(i=0;i<n;i++){
1211     ii = bs*i; ic = bs*c[i];
1212     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1213   }
1214 
1215   /* forward solve the U^T */
1216   idx = 0;
1217   for (i=0; i<n; i++) {
1218     v     = aa + bs2*diag[i];
1219     /* multiply by the inverse of the block diagonal */
1220     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1221     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1222     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1223     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1224     v -= bs2;
1225 
1226     vi    = aj + diag[i] - 1;
1227     nz    = diag[i] - diag[i+1] - 1;
1228     for(j=0;j>-nz;j--){
1229       oidx = bs*vi[j];
1230       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1231       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1232       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233       v  -= bs2;
1234     }
1235     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1236     idx += bs;
1237   }
1238   /* backward solve the L^T */
1239   for (i=n-1; i>=0; i--){
1240     v    = aa + bs2*ai[i];
1241     vi   = aj + ai[i];
1242     nz   = ai[i+1] - ai[i];
1243     idt  = bs*i;
1244     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1245     for(j=0;j<nz;j++){
1246       idx   = bs*vi[j];
1247       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1248       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1249       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1250       v += bs2;
1251     }
1252   }
1253 
1254   /* copy t into x according to permutation */
1255   for(i=0;i<n;i++){
1256     ii = bs*i;  ir = bs*r[i];
1257     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1258   }
1259 
1260   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1261   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1263   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1264   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1270 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1271 {
1272   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1273   IS                iscol=a->col,isrow=a->row;
1274   PetscErrorCode    ierr;
1275   const PetscInt    *r,*c,*rout,*cout;
1276   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1277   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1278   const MatScalar   *aa=a->a,*v;
1279   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1280   const PetscScalar *b;
1281 
1282   PetscFunctionBegin;
1283   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1284   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1285   t  = a->solve_work;
1286 
1287   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1288   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1289 
1290   /* copy the b into temp work space according to permutation */
1291   ii = 0;
1292   for (i=0; i<n; i++) {
1293     ic      = 4*c[i];
1294     t[ii]   = b[ic];
1295     t[ii+1] = b[ic+1];
1296     t[ii+2] = b[ic+2];
1297     t[ii+3] = b[ic+3];
1298     ii += 4;
1299   }
1300 
1301   /* forward solve the U^T */
1302   idx = 0;
1303   for (i=0; i<n; i++) {
1304 
1305     v     = aa + 16*diag[i];
1306     /* multiply by the inverse of the block diagonal */
1307     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1308     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1309     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1310     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1311     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1312     v += 16;
1313 
1314     vi    = aj + diag[i] + 1;
1315     nz    = ai[i+1] - diag[i] - 1;
1316     while (nz--) {
1317       oidx = 4*(*vi++);
1318       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1319       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1320       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1321       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1322       v  += 16;
1323     }
1324     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1325     idx += 4;
1326   }
1327   /* backward solve the L^T */
1328   for (i=n-1; i>=0; i--){
1329     v    = aa + 16*diag[i] - 16;
1330     vi   = aj + diag[i] - 1;
1331     nz   = diag[i] - ai[i];
1332     idt  = 4*i;
1333     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1334     while (nz--) {
1335       idx   = 4*(*vi--);
1336       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1337       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1338       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1339       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1340       v -= 16;
1341     }
1342   }
1343 
1344   /* copy t into x according to permutation */
1345   ii = 0;
1346   for (i=0; i<n; i++) {
1347     ir      = 4*r[i];
1348     x[ir]   = t[ii];
1349     x[ir+1] = t[ii+1];
1350     x[ir+2] = t[ii+2];
1351     x[ir+3] = t[ii+3];
1352     ii += 4;
1353   }
1354 
1355   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1356   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1358   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1359   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 #undef __FUNCT__
1364 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1365 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1366 {
1367   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1368   PetscErrorCode    ierr;
1369   IS                iscol=a->col,isrow=a->row;
1370   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1371   const PetscInt    *r,*c,*rout,*cout;
1372   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1373   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1374   const MatScalar   *aa=a->a,*v;
1375   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1376   const PetscScalar *b;
1377 
1378   PetscFunctionBegin;
1379   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1380   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1381   t = a->solve_work;
1382 
1383   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1384   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1385 
1386   /* copy b into temp work space according to permutation */
1387   for(i=0;i<n;i++){
1388     ii = bs*i; ic = bs*c[i];
1389     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1390   }
1391 
1392   /* forward solve the U^T */
1393   idx = 0;
1394   for (i=0; i<n; i++) {
1395     v     = aa + bs2*diag[i];
1396     /* multiply by the inverse of the block diagonal */
1397     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1398     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1399     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1400     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1401     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1402     v -= bs2;
1403 
1404     vi    = aj + diag[i] - 1;
1405     nz    = diag[i] - diag[i+1] - 1;
1406     for(j=0;j>-nz;j--){
1407       oidx = bs*vi[j];
1408       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1409       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1410       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1411       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1412       v  -= bs2;
1413     }
1414     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1415     idx += bs;
1416   }
1417   /* backward solve the L^T */
1418   for (i=n-1; i>=0; i--){
1419     v    = aa + bs2*ai[i];
1420     vi   = aj + ai[i];
1421     nz   = ai[i+1] - ai[i];
1422     idt  = bs*i;
1423     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1424     for(j=0;j<nz;j++){
1425       idx   = bs*vi[j];
1426       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1427       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1428       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1429       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1430       v += bs2;
1431     }
1432   }
1433 
1434   /* copy t into x according to permutation */
1435   for(i=0;i<n;i++){
1436     ii = bs*i;  ir = bs*r[i];
1437     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1438   }
1439 
1440   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1441   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1444   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 #undef __FUNCT__
1449 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1450 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1451 {
1452   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1453   IS                iscol=a->col,isrow=a->row;
1454   PetscErrorCode    ierr;
1455   const PetscInt    *r,*c,*rout,*cout;
1456   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1457   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1458   const MatScalar   *aa=a->a,*v;
1459   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1460   const PetscScalar *b;
1461 
1462   PetscFunctionBegin;
1463   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1464   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1465   t  = a->solve_work;
1466 
1467   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1468   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1469 
1470   /* copy the b into temp work space according to permutation */
1471   ii = 0;
1472   for (i=0; i<n; i++) {
1473     ic      = 5*c[i];
1474     t[ii]   = b[ic];
1475     t[ii+1] = b[ic+1];
1476     t[ii+2] = b[ic+2];
1477     t[ii+3] = b[ic+3];
1478     t[ii+4] = b[ic+4];
1479     ii += 5;
1480   }
1481 
1482   /* forward solve the U^T */
1483   idx = 0;
1484   for (i=0; i<n; i++) {
1485 
1486     v     = aa + 25*diag[i];
1487     /* multiply by the inverse of the block diagonal */
1488     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1489     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1490     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1491     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1492     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1493     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1494     v += 25;
1495 
1496     vi    = aj + diag[i] + 1;
1497     nz    = ai[i+1] - diag[i] - 1;
1498     while (nz--) {
1499       oidx = 5*(*vi++);
1500       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1501       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1502       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1503       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1504       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1505       v  += 25;
1506     }
1507     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1508     idx += 5;
1509   }
1510   /* backward solve the L^T */
1511   for (i=n-1; i>=0; i--){
1512     v    = aa + 25*diag[i] - 25;
1513     vi   = aj + diag[i] - 1;
1514     nz   = diag[i] - ai[i];
1515     idt  = 5*i;
1516     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1517     while (nz--) {
1518       idx   = 5*(*vi--);
1519       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1520       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1521       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1522       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1523       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1524       v -= 25;
1525     }
1526   }
1527 
1528   /* copy t into x according to permutation */
1529   ii = 0;
1530   for (i=0; i<n; i++) {
1531     ir      = 5*r[i];
1532     x[ir]   = t[ii];
1533     x[ir+1] = t[ii+1];
1534     x[ir+2] = t[ii+2];
1535     x[ir+3] = t[ii+3];
1536     x[ir+4] = t[ii+4];
1537     ii += 5;
1538   }
1539 
1540   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1541   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1542   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1543   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1544   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 #undef __FUNCT__
1549 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1550 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1551 {
1552   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1553   PetscErrorCode    ierr;
1554   IS                iscol=a->col,isrow=a->row;
1555   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1556   const PetscInt    *r,*c,*rout,*cout;
1557   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1558   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1559   const MatScalar   *aa=a->a,*v;
1560   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1561   const PetscScalar *b;
1562 
1563   PetscFunctionBegin;
1564   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1565   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1566   t = a->solve_work;
1567 
1568   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1569   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1570 
1571   /* copy b into temp work space according to permutation */
1572   for(i=0;i<n;i++){
1573     ii = bs*i; ic = bs*c[i];
1574     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1575     t[ii+4] = b[ic+4];
1576   }
1577 
1578   /* forward solve the U^T */
1579   idx = 0;
1580   for (i=0; i<n; i++) {
1581     v     = aa + bs2*diag[i];
1582     /* multiply by the inverse of the block diagonal */
1583     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1584     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1585     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1586     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1587     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1588     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1589     v -= bs2;
1590 
1591     vi    = aj + diag[i] - 1;
1592     nz    = diag[i] - diag[i+1] - 1;
1593     for(j=0;j>-nz;j--){
1594       oidx = bs*vi[j];
1595       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1596       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1597       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1598       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1599       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1600       v  -= bs2;
1601     }
1602     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1603     idx += bs;
1604   }
1605   /* backward solve the L^T */
1606   for (i=n-1; i>=0; i--){
1607     v    = aa + bs2*ai[i];
1608     vi   = aj + ai[i];
1609     nz   = ai[i+1] - ai[i];
1610     idt  = bs*i;
1611     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1612     for(j=0;j<nz;j++){
1613       idx   = bs*vi[j];
1614       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1615       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1616       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1617       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1618       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1619       v += bs2;
1620     }
1621   }
1622 
1623   /* copy t into x according to permutation */
1624   for(i=0;i<n;i++){
1625     ii = bs*i;  ir = bs*r[i];
1626     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1627     x[ir+4] = t[ii+4];
1628   }
1629 
1630   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1631   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1633   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1634   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 #undef __FUNCT__
1639 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1640 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1641 {
1642   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1643   IS                iscol=a->col,isrow=a->row;
1644   PetscErrorCode    ierr;
1645   const PetscInt    *r,*c,*rout,*cout;
1646   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1647   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1648   const MatScalar   *aa=a->a,*v;
1649   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1650   const PetscScalar *b;
1651 
1652   PetscFunctionBegin;
1653   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1654   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1655   t  = a->solve_work;
1656 
1657   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1658   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1659 
1660   /* copy the b into temp work space according to permutation */
1661   ii = 0;
1662   for (i=0; i<n; i++) {
1663     ic      = 6*c[i];
1664     t[ii]   = b[ic];
1665     t[ii+1] = b[ic+1];
1666     t[ii+2] = b[ic+2];
1667     t[ii+3] = b[ic+3];
1668     t[ii+4] = b[ic+4];
1669     t[ii+5] = b[ic+5];
1670     ii += 6;
1671   }
1672 
1673   /* forward solve the U^T */
1674   idx = 0;
1675   for (i=0; i<n; i++) {
1676 
1677     v     = aa + 36*diag[i];
1678     /* multiply by the inverse of the block diagonal */
1679     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1680     x6    = t[5+idx];
1681     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1682     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1683     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1684     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1685     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1686     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1687     v += 36;
1688 
1689     vi    = aj + diag[i] + 1;
1690     nz    = ai[i+1] - diag[i] - 1;
1691     while (nz--) {
1692       oidx = 6*(*vi++);
1693       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1694       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1695       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1696       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1697       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1698       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1699       v  += 36;
1700     }
1701     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1702     t[5+idx] = s6;
1703     idx += 6;
1704   }
1705   /* backward solve the L^T */
1706   for (i=n-1; i>=0; i--){
1707     v    = aa + 36*diag[i] - 36;
1708     vi   = aj + diag[i] - 1;
1709     nz   = diag[i] - ai[i];
1710     idt  = 6*i;
1711     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1712     s6 = t[5+idt];
1713     while (nz--) {
1714       idx   = 6*(*vi--);
1715       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1716       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1717       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1718       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1719       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1720       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1721       v -= 36;
1722     }
1723   }
1724 
1725   /* copy t into x according to permutation */
1726   ii = 0;
1727   for (i=0; i<n; i++) {
1728     ir      = 6*r[i];
1729     x[ir]   = t[ii];
1730     x[ir+1] = t[ii+1];
1731     x[ir+2] = t[ii+2];
1732     x[ir+3] = t[ii+3];
1733     x[ir+4] = t[ii+4];
1734     x[ir+5] = t[ii+5];
1735     ii += 6;
1736   }
1737 
1738   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1739   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1742   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1743   PetscFunctionReturn(0);
1744 }
1745 
1746 #undef __FUNCT__
1747 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1748 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1749 {
1750   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1751   PetscErrorCode    ierr;
1752   IS                iscol=a->col,isrow=a->row;
1753   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1754   const PetscInt    *r,*c,*rout,*cout;
1755   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1756   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1757   const MatScalar   *aa=a->a,*v;
1758   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759   const PetscScalar *b;
1760 
1761   PetscFunctionBegin;
1762   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1764   t = a->solve_work;
1765 
1766   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1767   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1768 
1769   /* copy b into temp work space according to permutation */
1770   for(i=0;i<n;i++){
1771     ii = bs*i; ic = bs*c[i];
1772     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1773     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1774   }
1775 
1776   /* forward solve the U^T */
1777   idx = 0;
1778   for (i=0; i<n; i++) {
1779     v     = aa + bs2*diag[i];
1780     /* multiply by the inverse of the block diagonal */
1781     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1782     x6    = t[5+idx];
1783     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1784     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1785     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1786     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1787     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1788     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1789     v -= bs2;
1790 
1791     vi    = aj + diag[i] - 1;
1792     nz    = diag[i] - diag[i+1] - 1;
1793     for(j=0;j>-nz;j--){
1794       oidx = bs*vi[j];
1795       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1796       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1797       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1798       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1799       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1800       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1801       v  -= bs2;
1802     }
1803     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1804     t[5+idx] = s6;
1805     idx += bs;
1806   }
1807   /* backward solve the L^T */
1808   for (i=n-1; i>=0; i--){
1809     v    = aa + bs2*ai[i];
1810     vi   = aj + ai[i];
1811     nz   = ai[i+1] - ai[i];
1812     idt  = bs*i;
1813     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1814     s6   = t[5+idt];
1815    for(j=0;j<nz;j++){
1816       idx   = bs*vi[j];
1817       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1818       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1819       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1820       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1821       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1822       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1823       v += bs2;
1824     }
1825   }
1826 
1827   /* copy t into x according to permutation */
1828   for(i=0;i<n;i++){
1829     ii = bs*i;  ir = bs*r[i];
1830     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1831     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1832   }
1833 
1834   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1835   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1836   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1837   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1838   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1839   PetscFunctionReturn(0);
1840 }
1841 
1842 #undef __FUNCT__
1843 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1844 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1845 {
1846   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1847   IS                iscol=a->col,isrow=a->row;
1848   PetscErrorCode    ierr;
1849   const PetscInt    *r,*c,*rout,*cout;
1850   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1851   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1852   const MatScalar   *aa=a->a,*v;
1853   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1854   const PetscScalar *b;
1855 
1856   PetscFunctionBegin;
1857   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1858   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1859   t  = a->solve_work;
1860 
1861   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1862   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1863 
1864   /* copy the b into temp work space according to permutation */
1865   ii = 0;
1866   for (i=0; i<n; i++) {
1867     ic      = 7*c[i];
1868     t[ii]   = b[ic];
1869     t[ii+1] = b[ic+1];
1870     t[ii+2] = b[ic+2];
1871     t[ii+3] = b[ic+3];
1872     t[ii+4] = b[ic+4];
1873     t[ii+5] = b[ic+5];
1874     t[ii+6] = b[ic+6];
1875     ii += 7;
1876   }
1877 
1878   /* forward solve the U^T */
1879   idx = 0;
1880   for (i=0; i<n; i++) {
1881 
1882     v     = aa + 49*diag[i];
1883     /* multiply by the inverse of the block diagonal */
1884     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1885     x6    = t[5+idx]; x7 = t[6+idx];
1886     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1887     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1888     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1889     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1890     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1891     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1892     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1893     v += 49;
1894 
1895     vi    = aj + diag[i] + 1;
1896     nz    = ai[i+1] - diag[i] - 1;
1897     while (nz--) {
1898       oidx = 7*(*vi++);
1899       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1900       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1901       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1902       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1903       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1904       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1905       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1906       v  += 49;
1907     }
1908     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1909     t[5+idx] = s6;t[6+idx] = s7;
1910     idx += 7;
1911   }
1912   /* backward solve the L^T */
1913   for (i=n-1; i>=0; i--){
1914     v    = aa + 49*diag[i] - 49;
1915     vi   = aj + diag[i] - 1;
1916     nz   = diag[i] - ai[i];
1917     idt  = 7*i;
1918     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1919     s6 = t[5+idt];s7 = t[6+idt];
1920     while (nz--) {
1921       idx   = 7*(*vi--);
1922       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1923       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1924       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1925       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1926       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1927       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1928       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1929       v -= 49;
1930     }
1931   }
1932 
1933   /* copy t into x according to permutation */
1934   ii = 0;
1935   for (i=0; i<n; i++) {
1936     ir      = 7*r[i];
1937     x[ir]   = t[ii];
1938     x[ir+1] = t[ii+1];
1939     x[ir+2] = t[ii+2];
1940     x[ir+3] = t[ii+3];
1941     x[ir+4] = t[ii+4];
1942     x[ir+5] = t[ii+5];
1943     x[ir+6] = t[ii+6];
1944     ii += 7;
1945   }
1946 
1947   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1948   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1950   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 #undef __FUNCT__
1955 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1956 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1957 {
1958   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1959   PetscErrorCode    ierr;
1960   IS                iscol=a->col,isrow=a->row;
1961   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1962   const PetscInt    *r,*c,*rout,*cout;
1963   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1964   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1965   const MatScalar   *aa=a->a,*v;
1966   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1967   const PetscScalar *b;
1968 
1969   PetscFunctionBegin;
1970   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1971   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1972   t = a->solve_work;
1973 
1974   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1975   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1976 
1977   /* copy b into temp work space according to permutation */
1978   for(i=0;i<n;i++){
1979     ii = bs*i; ic = bs*c[i];
1980     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1981     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1982   }
1983 
1984   /* forward solve the U^T */
1985   idx = 0;
1986   for (i=0; i<n; i++) {
1987     v     = aa + bs2*diag[i];
1988     /* multiply by the inverse of the block diagonal */
1989     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1990     x6    = t[5+idx]; x7 = t[6+idx];
1991     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1992     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1993     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1994     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1995     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1996     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1997     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1998     v -= bs2;
1999 
2000     vi    = aj + diag[i] - 1;
2001     nz    = diag[i] - diag[i+1] - 1;
2002     for(j=0;j>-nz;j--){
2003       oidx = bs*vi[j];
2004       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2005       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2006       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2007       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2008       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2009       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2010       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2011       v  -= bs2;
2012     }
2013     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2014     t[5+idx] = s6;  t[6+idx] = s7;
2015     idx += bs;
2016   }
2017   /* backward solve the L^T */
2018   for (i=n-1; i>=0; i--){
2019     v    = aa + bs2*ai[i];
2020     vi   = aj + ai[i];
2021     nz   = ai[i+1] - ai[i];
2022     idt  = bs*i;
2023     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2024     s6   = t[5+idt];  s7 = t[6+idt];
2025    for(j=0;j<nz;j++){
2026       idx   = bs*vi[j];
2027       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2028       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2029       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2030       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2031       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2032       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2033       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2034       v += bs2;
2035     }
2036   }
2037 
2038   /* copy t into x according to permutation */
2039   for(i=0;i<n;i++){
2040     ii = bs*i;  ir = bs*r[i];
2041     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2042     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2043   }
2044 
2045   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2046   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2047   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2048   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2049   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2050   PetscFunctionReturn(0);
2051 }
2052 
2053 /* ----------------------------------------------------------- */
2054 #undef __FUNCT__
2055 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2056 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2057 {
2058   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2059   IS                iscol=a->col,isrow=a->row;
2060   PetscErrorCode    ierr;
2061   const PetscInt    *r,*c,*rout,*cout;
2062   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2063   PetscInt          i,nz;
2064   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2065   const MatScalar   *aa=a->a,*v;
2066   PetscScalar       *x,*s,*t,*ls;
2067   const PetscScalar *b;
2068 
2069   PetscFunctionBegin;
2070   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2071   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2072   t  = a->solve_work;
2073 
2074   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2075   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2076 
2077   /* forward solve the lower triangular */
2078   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2079   for (i=1; i<n; i++) {
2080     v   = aa + bs2*ai[i];
2081     vi  = aj + ai[i];
2082     nz  = a->diag[i] - ai[i];
2083     s = t + bs*i;
2084     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2085     while (nz--) {
2086       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2087       v += bs2;
2088     }
2089   }
2090   /* backward solve the upper triangular */
2091   ls = a->solve_work + A->cmap->n;
2092   for (i=n-1; i>=0; i--){
2093     v   = aa + bs2*(a->diag[i] + 1);
2094     vi  = aj + a->diag[i] + 1;
2095     nz  = ai[i+1] - a->diag[i] - 1;
2096     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2097     while (nz--) {
2098       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2099       v += bs2;
2100     }
2101     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2102     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2103   }
2104 
2105   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2106   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2107   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2109   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2110   PetscFunctionReturn(0);
2111 }
2112 
2113 /* ----------------------------------------------------------- */
2114 #undef __FUNCT__
2115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2116 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2117 {
2118   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2119   IS                iscol=a->col,isrow=a->row;
2120   PetscErrorCode    ierr;
2121   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2122   PetscInt          i,nz,j;
2123   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2124   const MatScalar   *aa=a->a,*v;
2125   PetscScalar       *x,*t,*ls;
2126   const PetscScalar *b;
2127   PetscFunctionBegin;
2128   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2130   t    = a->solve_work;
2131 
2132   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2133   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2134 
2135   /* copy the b into temp work space according to permutation */
2136   for (i=0; i<n; i++) {
2137     for (j=0; j<bs; j++) {
2138       t[i*bs+j] = b[c[i]*bs+j];
2139     }
2140   }
2141 
2142 
2143   /* forward solve the upper triangular transpose */
2144   ls = a->solve_work + A->cmap->n;
2145   for (i=0; i<n; i++){
2146     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2147     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2148     v   = aa + bs2*(a->diag[i] + 1);
2149     vi  = aj + a->diag[i] + 1;
2150     nz  = ai[i+1] - a->diag[i] - 1;
2151     while (nz--) {
2152       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2153       v += bs2;
2154     }
2155   }
2156 
2157   /* backward solve the lower triangular transpose */
2158   for (i=n-1; i>=0; i--) {
2159     v   = aa + bs2*ai[i];
2160     vi  = aj + ai[i];
2161     nz  = a->diag[i] - ai[i];
2162     while (nz--) {
2163       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2164       v += bs2;
2165     }
2166   }
2167 
2168   /* copy t into x according to permutation */
2169   for (i=0; i<n; i++) {
2170     for (j=0; j<bs; j++) {
2171       x[bs*r[i]+j]   = t[bs*i+j];
2172     }
2173   }
2174 
2175   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2176   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2177   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2178   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2179   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2180   PetscFunctionReturn(0);
2181 }
2182 
2183 #undef __FUNCT__
2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2186 {
2187   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2188   IS                iscol=a->col,isrow=a->row;
2189   PetscErrorCode    ierr;
2190   const PetscInt    *r,*c,*rout,*cout;
2191   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2192   PetscInt          i,j,nz;
2193   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2194   const MatScalar   *aa=a->a,*v;
2195   PetscScalar       *x,*t,*ls;
2196   const PetscScalar *b;
2197 
2198   PetscFunctionBegin;
2199   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2200   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2201   t    = a->solve_work;
2202 
2203   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2204   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2205 
2206   /* copy the b into temp work space according to permutation */
2207   for (i=0; i<n; i++) {
2208     for (j=0; j<bs; j++) {
2209       t[i*bs+j] = b[c[i]*bs+j];
2210     }
2211   }
2212 
2213 
2214   /* forward solve the upper triangular transpose */
2215   ls = a->solve_work + A->cmap->n;
2216   for (i=0; i<n; i++){
2217     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2218     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2219     v   = aa + bs2*(diag[i] - 1);
2220     vi  = aj + diag[i] - 1;
2221     nz  = diag[i] - diag[i+1] - 1;
2222     for(j=0;j>-nz;j--){
2223       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2224       v -= bs2;
2225     }
2226   }
2227 
2228   /* backward solve the lower triangular transpose */
2229   for (i=n-1; i>=0; i--) {
2230     v   = aa + bs2*ai[i];
2231     vi  = aj + ai[i];
2232     nz  = ai[i+1] - ai[i];
2233     for(j=0;j<nz;j++){
2234       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2235       v += bs2;
2236     }
2237   }
2238 
2239   /* copy t into x according to permutation */
2240   for (i=0; i<n; i++) {
2241     for (j=0; j<bs; j++) {
2242       x[bs*r[i]+j]   = t[bs*i+j];
2243     }
2244   }
2245 
2246   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2247   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2248   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2249   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2250   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2251   PetscFunctionReturn(0);
2252 }
2253 
2254 /* bs = 15 for PFLOTRAN */
2255 
2256 #undef __FUNCT__
2257 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering"
2258 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering(Mat A,Vec bb,Vec xx)
2259 {
2260   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2261   PetscErrorCode    ierr;
2262   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2263   PetscInt          i,nz,idx,idt,idc,m;
2264   const MatScalar   *aa=a->a,*v;
2265   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2266   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2267   PetscScalar       *x,*t;
2268   const PetscScalar *b;
2269 
2270   PetscFunctionBegin;
2271   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2272   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2273   t  = a->solve_work;
2274 
2275   /* forward solve the lower triangular */
2276   idx    = 0;
2277   t[0]  = b[idx];    t[1]  = b[1+idx];  t[2]  = b[2+idx];  t[3]  = b[3+idx];  t[4]  = b[4+idx];
2278   t[5]  = b[5+idx];  t[6]  = b[6+idx];  t[7]  = b[7+idx];  t[8]  = b[8+idx];  t[9]  = b[9+idx];
2279   t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx];
2280 
2281   for (i=1; i<n; i++) {
2282     v     = aa + bs2*ai[i];
2283     vi    = aj + ai[i];
2284     nz    = ai[i+1] - ai[i];
2285     idx   = bs*i;
2286     s1   = b[idx];    s2  = b[1+idx];  s3  = b[2+idx];  s4  = b[3+idx];  s5  = b[4+idx];
2287     s6   = b[5+idx];  s7  = b[6+idx];  s8  = b[7+idx];  s9  = b[8+idx];  s10 = b[9+idx];
2288     s11  = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx];
2289     for(m=0;m<nz;m++){
2290       idx   = bs*vi[m];
2291       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2292       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2293       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2294 
2295 
2296       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2297       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2298       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2299       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2300       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2301       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2302       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2303       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2304       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2305       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2306       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2307       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2308       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2309       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2310       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2311 
2312       v += bs2;
2313     }
2314     idx = bs*i;
2315     t[idx]    = s1;  t[1+idx]  = s2;  t[2+idx]  = s3;  t[3+idx]  = s4;  t[4+idx]  = s5;
2316     t[5+idx]  = s6;  t[6+idx]  = s7;  t[7+idx]  = s8;  t[8+idx]  = s9;  t[9+idx]  = s10;
2317     t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15;
2318 
2319   }
2320   /* backward solve the upper triangular */
2321   for (i=n-1; i>=0; i--){
2322     v    = aa + bs2*(adiag[i+1]+1);
2323     vi   = aj + adiag[i+1]+1;
2324     nz   = adiag[i] - adiag[i+1] - 1;
2325     idt  = bs*i;
2326     s1   = t[idt];     s2  = t[1+idt];  s3  = t[2+idt];  s4  = t[3+idt];  s5  = t[4+idt];
2327     s6   = t[5+idt];   s7  = t[6+idt];  s8  = t[7+idt];  s9  = t[8+idt];  s10 = t[9+idt];
2328     s11  = t[10+idt]; s12  = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt];
2329 
2330     for(m=0;m<nz;m++){
2331       idx   = bs*vi[m];
2332       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2333       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2334       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2335 
2336       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2337       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2338       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2339       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2340       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2341       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2342       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2343       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2344       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2345       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2346       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2347       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2348       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2349       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2350       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2351 
2352       v += bs2;
2353     }
2354     idc = bs*i;
2355 
2356     x[idc]    = t[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2357     x[1+idc]  = t[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2358     x[2+idc]  = t[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2359     x[3+idc]  = t[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2360     x[4+idc]  = t[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2361     x[5+idc]  = t[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2362     x[6+idc]  = t[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2363     x[7+idc]  = t[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2364     x[8+idc]  = t[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2365     x[9+idc]  = t[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2366     x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2367     x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2368     x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2369     x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2370     x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2371 
2372   }
2373 
2374   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2375   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2376   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2377   PetscFunctionReturn(0);
2378 }
2379 
2380 #undef __FUNCT__
2381 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2382 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2383 {
2384   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2385   PetscErrorCode    ierr;
2386   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2387   PetscInt          i,k,nz,idx,idt,idc,m;
2388   const MatScalar   *aa=a->a,*v;
2389   PetscScalar       s[15];
2390   PetscScalar       *x,*t;
2391   const PetscScalar *b;
2392 
2393   PetscFunctionBegin;
2394   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2395   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2396   t  = a->solve_work;
2397 
2398   /* forward solve the lower triangular */
2399   idx    = 0;
2400   t[0]  = b[idx];    t[1]  = b[1+idx];  t[2]  = b[2+idx];  t[3]  = b[3+idx];  t[4]  = b[4+idx];
2401   t[5]  = b[5+idx];  t[6]  = b[6+idx];  t[7]  = b[7+idx];  t[8]  = b[8+idx];  t[9]  = b[9+idx];
2402   t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx];
2403 
2404   for (i=1; i<n; i++) {
2405     v     = aa + bs2*ai[i];
2406     vi    = aj + ai[i];
2407     nz    = ai[i+1] - ai[i];
2408     idx   = bs*i;
2409     s[0]   = b[idx];    s[1]  = b[1+idx];  s[2]  = b[2+idx];  s[3]  = b[3+idx];  s[4]  = b[4+idx];
2410     s[5]   = b[5+idx];  s[6]  = b[6+idx];  s[7]  = b[7+idx];  s[8]  = b[8+idx];  s[9] = b[9+idx];
2411     s[10]  = b[10+idx]; s[11] = b[11+idx]; s[12] = b[12+idx]; s[13] = b[13+idx]; s[14] = b[14+idx];
2412     for(m=0;m<nz;m++){
2413       idx   = bs*vi[m];
2414 
2415       for(k=0;k<15;k++){
2416 	s[0] -= v[0]*t[k+idx];
2417 	s[1] -= v[1]*t[k+idx];
2418 	s[2] -= v[2]*t[k+idx];
2419         s[3] -= v[3]*t[k+idx];
2420 	s[4] -= v[4]*t[k+idx];
2421 	s[5] -= v[5]*t[k+idx];
2422 	s[6] -= v[6]*t[k+idx];
2423         s[7] -= v[7]*t[k+idx];
2424 	s[8] -= v[8]*t[k+idx];
2425 	s[9] -= v[9]*t[k+idx];
2426 	s[10] -= v[10]*t[k+idx];
2427         s[11] -= v[11]*t[k+idx];
2428 	s[12] -= v[12]*t[k+idx];
2429 	s[13] -= v[13]*t[k+idx];
2430 	s[14] -= v[14]*t[k+idx];
2431 	v += 15;
2432       }
2433     }
2434     idx = bs*i;
2435     t[idx]    = s[0];  t[1+idx]  = s[1];  t[2+idx]  = s[2];  t[3+idx]  = s[3];  t[4+idx]  = s[4];
2436     t[5+idx]  = s[5];  t[6+idx]  = s[6];  t[7+idx]  = s[7];  t[8+idx]  = s[8];  t[9+idx]  = s[9];
2437     t[10+idx] = s[10]; t[11+idx] = s[11]; t[12+idx] = s[12]; t[13+idx] = s[13]; t[14+idx] = s[14];
2438 
2439   }
2440   /* backward solve the upper triangular */
2441   for (i=n-1; i>=0; i--){
2442     v    = aa + bs2*(adiag[i+1]+1);
2443     vi   = aj + adiag[i+1]+1;
2444     nz   = adiag[i] - adiag[i+1] - 1;
2445     idt  = bs*i;
2446     s[0]   = t[idt];    s[1]  = t[1+idt];  s[2]  = t[2+idt];  s[3]  = t[3+idt];  s[4]  = t[4+idt];
2447     s[5]   = t[5+idt];  s[6]  = t[6+idt];  s[7]  = t[7+idt];  s[8]  = t[8+idt];  s[9] = t[9+idt];
2448     s[10]  = t[10+idt]; s[11] = t[11+idt]; s[12] = t[12+idt]; s[13] = t[13+idt]; s[14] = t[14+idt];
2449 
2450     for(m=0;m<nz;m++){
2451       idx   = bs*vi[m];
2452       for(k=0;k<15;k++){
2453 	s[0] -= v[0]*t[k+idx];
2454 	s[1] -= v[1]*t[k+idx];
2455 	s[2] -= v[2]*t[k+idx];
2456         s[3] -= v[3]*t[k+idx];
2457 	s[4] -= v[4]*t[k+idx];
2458 	s[5] -= v[5]*t[k+idx];
2459 	s[6] -= v[6]*t[k+idx];
2460         s[7] -= v[7]*t[k+idx];
2461 	s[8] -= v[8]*t[k+idx];
2462 	s[9] -= v[9]*t[k+idx];
2463 	s[10] -= v[10]*t[k+idx];
2464         s[11] -= v[11]*t[k+idx];
2465 	s[12] -= v[12]*t[k+idx];
2466 	s[13] -= v[13]*t[k+idx];
2467 	s[14] -= v[14]*t[k+idx];
2468 	v += 15;
2469       }
2470     }
2471     idc = bs*i;
2472 
2473     for(k=0;k<15;k++){
2474       t[idt]    += v[0]*s[k];
2475       t[1+idt]  += v[1]*s[k];
2476       t[2+idt]  += v[2]*s[k];
2477       t[3+idt]  += v[3]*s[k];
2478       t[4+idt]  += v[4]*s[k];
2479       t[5+idt]  += v[5]*s[k];
2480       t[6+idt]  += v[6]*s[k];
2481       t[7+idt]  += v[7]*s[k];
2482       t[8+idt]  += v[8]*s[k];
2483       t[9+idt]  += v[9]*s[k];
2484       t[10+idt] += v[10]*s[k];
2485       t[11+idt] += v[11]*s[k];
2486       t[12+idt] += v[12]*s[k];
2487       t[13+idt] += v[13]*s[k];
2488       t[14+idt] += v[14]*s[k];
2489       v += 15;
2490      }
2491      x[idc] = t[idt]; x[1+idc] = t[1+idt]; x[2+idc] = t[2+idt]; x[3+idc] = t[3+idt]; x[4+idc] = t[4+idt];
2492      x[5+idc] = t[5+idt]; x[6+idc] = t[6+idt]; x[7+idc] = t[7+idt]; x[8+idc] = t[8+idt]; x[9+idc] = t[9+idt];
2493      x[10+idc] = t[10+idt]; x[11+idc] = t[11+idt]; x[12+idc] = t[12+idt]; x[13+idc] = t[13+idt]; x[14+idc] = t[14+idt];
2494   }
2495 
2496   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2497   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2498   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2499   PetscFunctionReturn(0);
2500 }
2501 
2502 
2503 #undef __FUNCT__
2504 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2505 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2506 {
2507   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2508   IS                iscol=a->col,isrow=a->row;
2509   PetscErrorCode    ierr;
2510   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2511   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2512   PetscInt          i,nz,idx,idt,idc;
2513   const MatScalar   *aa=a->a,*v;
2514   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2515   const PetscScalar *b;
2516 
2517   PetscFunctionBegin;
2518   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2519   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2520   t  = a->solve_work;
2521 
2522   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2523   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2524 
2525   /* forward solve the lower triangular */
2526   idx    = 7*(*r++);
2527   t[0] = b[idx];   t[1] = b[1+idx];
2528   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2529   t[5] = b[5+idx]; t[6] = b[6+idx];
2530 
2531   for (i=1; i<n; i++) {
2532     v     = aa + 49*ai[i];
2533     vi    = aj + ai[i];
2534     nz    = diag[i] - ai[i];
2535     idx   = 7*(*r++);
2536     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2537     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2538     while (nz--) {
2539       idx   = 7*(*vi++);
2540       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2541       x4    = t[3+idx];x5 = t[4+idx];
2542       x6    = t[5+idx];x7 = t[6+idx];
2543       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2544       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2545       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2546       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2547       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2548       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2549       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2550       v += 49;
2551     }
2552     idx = 7*i;
2553     t[idx]   = s1;t[1+idx] = s2;
2554     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2555     t[5+idx] = s6;t[6+idx] = s7;
2556   }
2557   /* backward solve the upper triangular */
2558   for (i=n-1; i>=0; i--){
2559     v    = aa + 49*diag[i] + 49;
2560     vi   = aj + diag[i] + 1;
2561     nz   = ai[i+1] - diag[i] - 1;
2562     idt  = 7*i;
2563     s1 = t[idt];  s2 = t[1+idt];
2564     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2565     s6 = t[5+idt];s7 = t[6+idt];
2566     while (nz--) {
2567       idx   = 7*(*vi++);
2568       x1    = t[idx];   x2 = t[1+idx];
2569       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2570       x6    = t[5+idx]; x7 = t[6+idx];
2571       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2572       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2573       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2574       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2575       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2576       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2577       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2578       v += 49;
2579     }
2580     idc = 7*(*c--);
2581     v   = aa + 49*diag[i];
2582     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2583                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2584     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2585                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2586     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2587                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2588     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2589                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2590     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2591                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2592     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2593                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2594     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2595                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2596   }
2597 
2598   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2599   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2600   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2601   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2602   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2603   PetscFunctionReturn(0);
2604 }
2605 
2606 #undef __FUNCT__
2607 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2608 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2609 {
2610   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2611   IS                iscol=a->col,isrow=a->row;
2612   PetscErrorCode    ierr;
2613   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2614   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2615   PetscInt          i,nz,idx,idt,idc,m;
2616   const MatScalar   *aa=a->a,*v;
2617   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2618   const PetscScalar *b;
2619 
2620   PetscFunctionBegin;
2621   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2622   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2623   t  = a->solve_work;
2624 
2625   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2626   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2627 
2628   /* forward solve the lower triangular */
2629   idx    = 7*r[0];
2630   t[0] = b[idx];   t[1] = b[1+idx];
2631   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2632   t[5] = b[5+idx]; t[6] = b[6+idx];
2633 
2634   for (i=1; i<n; i++) {
2635     v     = aa + 49*ai[i];
2636     vi    = aj + ai[i];
2637     nz    = ai[i+1] - ai[i];
2638     idx   = 7*r[i];
2639     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2640     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2641     for(m=0;m<nz;m++){
2642       idx   = 7*vi[m];
2643       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2644       x4    = t[3+idx];x5 = t[4+idx];
2645       x6    = t[5+idx];x7 = t[6+idx];
2646       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2647       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2648       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2649       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2650       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2651       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2652       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2653       v += 49;
2654     }
2655     idx = 7*i;
2656     t[idx]   = s1;t[1+idx] = s2;
2657     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2658     t[5+idx] = s6;t[6+idx] = s7;
2659   }
2660   /* backward solve the upper triangular */
2661   for (i=n-1; i>=0; i--){
2662     v    = aa + 49*(adiag[i+1]+1);
2663     vi   = aj + adiag[i+1]+1;
2664     nz   = adiag[i] - adiag[i+1] - 1;
2665     idt  = 7*i;
2666     s1 = t[idt];  s2 = t[1+idt];
2667     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2668     s6 = t[5+idt];s7 = t[6+idt];
2669     for(m=0;m<nz;m++){
2670       idx   = 7*vi[m];
2671       x1    = t[idx];   x2 = t[1+idx];
2672       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2673       x6    = t[5+idx]; x7 = t[6+idx];
2674       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2675       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2676       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2677       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2678       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2679       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2680       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2681       v += 49;
2682     }
2683     idc = 7*c[i];
2684     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2685                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2686     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2687                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2688     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2689                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2690     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2691                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2692     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2693                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2694     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2695                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2696     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2697                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2698   }
2699 
2700   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2701   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2702   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2703   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2704   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2705   PetscFunctionReturn(0);
2706 }
2707 
2708 #undef __FUNCT__
2709 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2710 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2711 {
2712   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2713   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2714   PetscErrorCode    ierr;
2715   PetscInt          i,nz,idx,idt,jdx;
2716   const MatScalar   *aa=a->a,*v;
2717   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2718   const PetscScalar *b;
2719 
2720   PetscFunctionBegin;
2721   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2722   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2723   /* forward solve the lower triangular */
2724   idx    = 0;
2725   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2726   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2727   x[6] = b[6+idx];
2728   for (i=1; i<n; i++) {
2729     v     =  aa + 49*ai[i];
2730     vi    =  aj + ai[i];
2731     nz    =  diag[i] - ai[i];
2732     idx   =  7*i;
2733     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2734     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2735     s7  =  b[6+idx];
2736     while (nz--) {
2737       jdx   = 7*(*vi++);
2738       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2739       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2740       x7    = x[6+jdx];
2741       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2742       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2743       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2744       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2745       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2746       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2747       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2748       v += 49;
2749      }
2750     x[idx]   = s1;
2751     x[1+idx] = s2;
2752     x[2+idx] = s3;
2753     x[3+idx] = s4;
2754     x[4+idx] = s5;
2755     x[5+idx] = s6;
2756     x[6+idx] = s7;
2757   }
2758   /* backward solve the upper triangular */
2759   for (i=n-1; i>=0; i--){
2760     v    = aa + 49*diag[i] + 49;
2761     vi   = aj + diag[i] + 1;
2762     nz   = ai[i+1] - diag[i] - 1;
2763     idt  = 7*i;
2764     s1 = x[idt];   s2 = x[1+idt];
2765     s3 = x[2+idt]; s4 = x[3+idt];
2766     s5 = x[4+idt]; s6 = x[5+idt];
2767     s7 = x[6+idt];
2768     while (nz--) {
2769       idx   = 7*(*vi++);
2770       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2771       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2772       x7    = x[6+idx];
2773       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2774       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2775       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2776       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2777       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2778       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2779       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2780       v += 49;
2781     }
2782     v        = aa + 49*diag[i];
2783     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2784                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2785     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2786                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2787     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2788                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2789     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2790                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2791     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2792                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2793     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2794                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2795     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2796                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2797   }
2798 
2799   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2800   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2801   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2802   PetscFunctionReturn(0);
2803 }
2804 
2805 #undef __FUNCT__
2806 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2807 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2808 {
2809     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2810     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2811     PetscErrorCode    ierr;
2812     PetscInt          i,k,nz,idx,jdx,idt;
2813     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2814     const MatScalar   *aa=a->a,*v;
2815     PetscScalar       *x;
2816     const PetscScalar *b;
2817     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2818 
2819     PetscFunctionBegin;
2820     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2821     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2822     /* forward solve the lower triangular */
2823     idx    = 0;
2824     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2825     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2826     for (i=1; i<n; i++) {
2827        v    = aa + bs2*ai[i];
2828        vi   = aj + ai[i];
2829        nz   = ai[i+1] - ai[i];
2830       idx   = bs*i;
2831        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2832        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2833        for(k=0;k<nz;k++) {
2834           jdx   = bs*vi[k];
2835           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2836 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2837           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2838           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2839           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2840 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2841           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2842 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2843 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2844           v   +=  bs2;
2845         }
2846 
2847        x[idx]   = s1;
2848        x[1+idx] = s2;
2849        x[2+idx] = s3;
2850        x[3+idx] = s4;
2851        x[4+idx] = s5;
2852        x[5+idx] = s6;
2853        x[6+idx] = s7;
2854     }
2855 
2856    /* backward solve the upper triangular */
2857   for (i=n-1; i>=0; i--){
2858     v   = aa + bs2*(adiag[i+1]+1);
2859      vi  = aj + adiag[i+1]+1;
2860      nz  = adiag[i] - adiag[i+1]-1;
2861      idt = bs*i;
2862      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2863      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2864     for(k=0;k<nz;k++) {
2865       idx   = bs*vi[k];
2866        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2867        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2868        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2869        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2870        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2871        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2872        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2873        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2874        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2875         v   +=  bs2;
2876     }
2877     /* x = inv_diagonal*x */
2878     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2879     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2880     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2881     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2882     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2883     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2884     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2885   }
2886 
2887   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2888   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2889   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2890   PetscFunctionReturn(0);
2891 }
2892 
2893 #undef __FUNCT__
2894 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
     /*
        MatSolve_SeqBAIJ_6_inplace - Forward and backward triangular solve with 6x6
        blocks, applying the row index set a->row to the right-hand side and the
        column index set a->col to the result.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine, for block row i:
          entries ai[i] .. diag[i]-1      blocks used by the forward solve
          entry   diag[i]                 block multiplied into the row result at the
                                          end of the backward solve (sibling routines
                                          in this file label that step
                                          "x = inv_diagonal*x")
          entries diag[i]+1 .. ai[i+1]-1  blocks used by the backward solve
        Each block is 36 consecutive scalars addressed as v[row + 6*col].

        a->solve_work is used as scratch of 6*n scalars; presumably it is allocated
        elsewhere with at least that size -- not checked here.
        NOTE(review): the first block row is read unconditionally, so n = a->mbs is
        assumed to be at least 1 -- confirm callers never pass an empty matrix.
     */
2895 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2896 {
2897   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2898   IS                iscol=a->col,isrow=a->row;
2899   PetscErrorCode    ierr;
2900   const PetscInt    *r,*c,*rout,*cout;
2901   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2902   PetscInt          i,nz,idx,idt,idc;
2903   const MatScalar   *aa=a->a,*v;
2904   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2905   const PetscScalar *b;
2906 
2907   PetscFunctionBegin;
       /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
2908   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2909   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* t holds the intermediate vector, indexed by block row i (not permuted) */
2910   t  = a->solve_work;
2911 
       /* r is walked forwards in the forward solve; c starts at the last entry and is
          walked backwards in the backward solve */
2912   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2913   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2914 
2915   /* forward solve the lower triangular */
       /* block row 0: nothing is subtracted, the permuted right-hand side is copied into t */
2916   idx    = 6*(*r++);
2917   t[0] = b[idx];   t[1] = b[1+idx];
2918   t[2] = b[2+idx]; t[3] = b[3+idx];
2919   t[4] = b[4+idx]; t[5] = b[5+idx];
2920   for (i=1; i<n; i++) {
2921     v     = aa + 36*ai[i];
2922     vi    = aj + ai[i];
         /* number of blocks stored before the diagonal entry of row i */
2923     nz    = diag[i] - ai[i];
2924     idx   = 6*(*r++);
2925     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2926     s5  = b[4+idx]; s6 = b[5+idx];
         /* s -= block * t[block column] for each of those blocks */
2927     while (nz--) {
2928       idx   = 6*(*vi++);
2929       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2930       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2931       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2932       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2933       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2934       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2935       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2936       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2937       v += 36;
2938     }
2939     idx = 6*i;
2940     t[idx]   = s1;t[1+idx] = s2;
2941     t[2+idx] = s3;t[3+idx] = s4;
2942     t[4+idx] = s5;t[5+idx] = s6;
2943   }
2944   /* backward solve the upper triangular */
2945   for (i=n-1; i>=0; i--){
         /* start at the block stored just after the diagonal entry of row i */
2946     v    = aa + 36*diag[i] + 36;
2947     vi   = aj + diag[i] + 1;
2948     nz   = ai[i+1] - diag[i] - 1;
2949     idt  = 6*i;
2950     s1 = t[idt];  s2 = t[1+idt];
2951     s3 = t[2+idt];s4 = t[3+idt];
2952     s5 = t[4+idt];s6 = t[5+idt];
2953     while (nz--) {
2954       idx   = 6*(*vi++);
2955       x1    = t[idx];   x2 = t[1+idx];
2956       x3    = t[2+idx]; x4 = t[3+idx];
2957       x5    = t[4+idx]; x6 = t[5+idx];
2958       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2959       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2960       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2961       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2962       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2963       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2964       v += 36;
2965     }
         /* multiply by the block stored at diag[i]; the result goes to x at the position
            given by the column index set and also back into t, which later iterations
            of this loop read */
2966     idc = 6*(*c--);
2967     v   = aa + 36*diag[i];
2968     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2969                                  v[18]*s4+v[24]*s5+v[30]*s6;
2970     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2971                                  v[19]*s4+v[25]*s5+v[31]*s6;
2972     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2973                                  v[20]*s4+v[26]*s5+v[32]*s6;
2974     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2975                                  v[21]*s4+v[27]*s5+v[33]*s6;
2976     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2977                                  v[22]*s4+v[28]*s5+v[34]*s6;
2978     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2979                                  v[23]*s4+v[29]*s5+v[35]*s6;
2980   }
2981 
2982   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2983   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2984   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2985   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*36 per counted block (a->nz) minus 6 per column of A */
2986   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2987   PetscFunctionReturn(0);
2988 }
2989 
2990 #undef __FUNCT__
2991 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
     /*
        MatSolve_SeqBAIJ_6 - Forward and backward triangular solve with 6x6 blocks,
        applying the row index set a->row to the right-hand side and the column
        index set a->col to the result.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine (differs from the _inplace variant):
          entries ai[i] .. ai[i+1]-1          all used by the forward solve of row i
          entries adiag[i+1]+1 .. adiag[i]-1  blocks used by the backward solve of row i
          entry   adiag[i]                    block multiplied into the row result last
                                              (sibling routines in this file label that
                                              step "x = inv_diagonal*x")
        adiag[i+1] is read for i = n-1, so adiag must have n+1 entries.
        Each block is 36 consecutive scalars addressed as v[row + 6*col].

        a->solve_work is used as scratch of 6*n scalars; presumably it is allocated
        elsewhere with at least that size -- not checked here.
        NOTE(review): the first block row is read unconditionally, so n = a->mbs is
        assumed to be at least 1 -- confirm callers never pass an empty matrix.
     */
2992 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2993 {
2994   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2995   IS                iscol=a->col,isrow=a->row;
2996   PetscErrorCode    ierr;
2997   const PetscInt    *r,*c,*rout,*cout;
2998   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2999   PetscInt          i,nz,idx,idt,idc,m;
3000   const MatScalar   *aa=a->a,*v;
3001   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
3002   const PetscScalar *b;
3003 
3004   PetscFunctionBegin;
       /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
3005   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3006   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* t holds the intermediate vector, indexed by block row i (not permuted) */
3007   t  = a->solve_work;
3008 
       /* r and c are indexed directly by block row i below (r[i], c[i]) */
3009   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3010   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3011 
3012   /* forward solve the lower triangular */
       /* block row 0: nothing is subtracted, the permuted right-hand side is copied into t */
3013   idx    = 6*r[0];
3014   t[0] = b[idx];   t[1] = b[1+idx];
3015   t[2] = b[2+idx]; t[3] = b[3+idx];
3016   t[4] = b[4+idx]; t[5] = b[5+idx];
3017   for (i=1; i<n; i++) {
3018     v     = aa + 36*ai[i];
3019     vi    = aj + ai[i];
         /* every block listed for row i in ai/aj takes part in the forward solve */
3020     nz    = ai[i+1] - ai[i];
3021     idx   = 6*r[i];
3022     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3023     s5  = b[4+idx]; s6 = b[5+idx];
         /* s -= block * t[block column] for each of those blocks */
3024     for(m=0;m<nz;m++){
3025       idx   = 6*vi[m];
3026       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3027       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3028       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3029       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3030       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3031       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3032       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3033       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3034       v += 36;
3035     }
3036     idx = 6*i;
3037     t[idx]   = s1;t[1+idx] = s2;
3038     t[2+idx] = s3;t[3+idx] = s4;
3039     t[4+idx] = s5;t[5+idx] = s6;
3040   }
3041   /* backward solve the upper triangular */
3042   for (i=n-1; i>=0; i--){
         /* blocks for row i start one entry past adiag[i+1] and stop just before adiag[i] */
3043     v    = aa + 36*(adiag[i+1]+1);
3044     vi   = aj + adiag[i+1]+1;
3045     nz   = adiag[i] - adiag[i+1] - 1;
3046     idt  = 6*i;
3047     s1 = t[idt];  s2 = t[1+idt];
3048     s3 = t[2+idt];s4 = t[3+idt];
3049     s5 = t[4+idt];s6 = t[5+idt];
3050     for(m=0;m<nz;m++){
3051       idx   = 6*vi[m];
3052       x1    = t[idx];   x2 = t[1+idx];
3053       x3    = t[2+idx]; x4 = t[3+idx];
3054       x5    = t[4+idx]; x6 = t[5+idx];
3055       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3056       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3057       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3058       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3059       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3060       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3061       v += 36;
3062     }
         /* after the loop v has advanced to entry adiag[i]; multiply by that block and
            store the result in x (at the position given by c[i]) and back into t, which
            later iterations of this loop read */
3063     idc = 6*c[i];
3064     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3065                                  v[18]*s4+v[24]*s5+v[30]*s6;
3066     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3067                                  v[19]*s4+v[25]*s5+v[31]*s6;
3068     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3069                                  v[20]*s4+v[26]*s5+v[32]*s6;
3070     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3071                                  v[21]*s4+v[27]*s5+v[33]*s6;
3072     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3073                                  v[22]*s4+v[28]*s5+v[34]*s6;
3074     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3075                                  v[23]*s4+v[29]*s5+v[35]*s6;
3076   }
3077 
3078   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3079   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3080   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3081   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*36 per counted block (a->nz) minus 6 per column of A */
3082   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3083   PetscFunctionReturn(0);
3084 }
3085 
3086 #undef __FUNCT__
3087 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
     /*
        MatSolve_SeqBAIJ_6_NaturalOrdering_inplace - Forward and backward triangular
        solve with 6x6 blocks and no row/column index sets: block row i of bb maps
        straight to block row i of xx, and xx itself is the working vector.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine, for block row i:
          entries ai[i] .. diag[i]-1      blocks used by the forward solve
          entry   diag[i]                 block multiplied into the row result at the
                                          end of the backward solve (sibling routines
                                          in this file label that step
                                          "x = inv_diagonal*x")
          entries diag[i]+1 .. ai[i+1]-1  blocks used by the backward solve
        Each block is 36 consecutive scalars addressed as v[row + 6*col].

        NOTE(review): x[0..5] is written unconditionally, so n = a->mbs is assumed to
        be at least 1 -- confirm callers never pass an empty matrix.
     */
3088 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3089 {
3090   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3091   PetscInt          i,nz,idx,idt,jdx;
3092   PetscErrorCode    ierr;
3093   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3094   const MatScalar   *aa=a->a,*v;
3095   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3096   const PetscScalar *b;
3097 
3098   PetscFunctionBegin;
       /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
3099   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3100   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3101   /* forward solve the lower triangular */
       /* block row 0: nothing is subtracted, the right-hand side is copied into x */
3102   idx    = 0;
3103   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3104   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3105   for (i=1; i<n; i++) {
3106     v     =  aa + 36*ai[i];
3107     vi    =  aj + ai[i];
         /* number of blocks stored before the diagonal entry of row i */
3108     nz    =  diag[i] - ai[i];
3109     idx   =  6*i;
3110     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3111     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
         /* s -= block * x[block column]; idx must stay 6*i, hence the separate jdx */
3112     while (nz--) {
3113       jdx   = 6*(*vi++);
3114       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3115       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3116       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3117       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3118       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3119       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3120       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3121       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3122       v += 36;
3123      }
3124     x[idx]   = s1;
3125     x[1+idx] = s2;
3126     x[2+idx] = s3;
3127     x[3+idx] = s4;
3128     x[4+idx] = s5;
3129     x[5+idx] = s6;
3130   }
3131   /* backward solve the upper triangular */
3132   for (i=n-1; i>=0; i--){
         /* start at the block stored just after the diagonal entry of row i */
3133     v    = aa + 36*diag[i] + 36;
3134     vi   = aj + diag[i] + 1;
3135     nz   = ai[i+1] - diag[i] - 1;
3136     idt  = 6*i;
3137     s1 = x[idt];   s2 = x[1+idt];
3138     s3 = x[2+idt]; s4 = x[3+idt];
3139     s5 = x[4+idt]; s6 = x[5+idt];
3140     while (nz--) {
3141       idx   = 6*(*vi++);
3142       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3143       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3144       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3145       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3146       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3147       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3148       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3149       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3150       v += 36;
3151     }
         /* multiply by the block stored at diag[i] and overwrite block row i of x */
3152     v        = aa + 36*diag[i];
3153     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3154     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3155     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3156     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3157     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3158     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3159   }
3160 
3161   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3162   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*36 per counted block (a->nz) minus 6 per column of A */
3163   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3164   PetscFunctionReturn(0);
3165 }
3166 
3167 #undef __FUNCT__
3168 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
     /*
        MatSolve_SeqBAIJ_6_NaturalOrdering - Forward and backward triangular solve
        with 6x6 blocks and no row/column index sets: block row i of bb maps straight
        to block row i of xx, and xx itself is the working vector.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine (differs from the _inplace variant):
          entries ai[i] .. ai[i+1]-1          all used by the forward solve of row i
          entries adiag[i+1]+1 .. adiag[i]-1  blocks used by the backward solve of row i
          entry   adiag[i]                    block applied last ("inv_diagonal" below)
        adiag[i+1] is read for i = n-1, so adiag must have n+1 entries.

        NOTE(review): strides come from bs = A->rmap->bs and bs2 = a->bs2, but the
        unrolled arithmetic touches exactly six components and v[0..35], so this only
        makes sense for bs == 6 and bs2 == 36 -- assumed, not checked here.
        NOTE(review): x[0..5] is written unconditionally, so n = a->mbs is assumed to
        be at least 1 -- confirm callers never pass an empty matrix.
     */
3169 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3170 {
3171     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3172     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3173     PetscErrorCode    ierr;
3174     PetscInt          i,k,nz,idx,jdx,idt;
3175     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3176     const MatScalar   *aa=a->a,*v;
3177     PetscScalar       *x;
3178     const PetscScalar *b;
3179     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3180 
3181     PetscFunctionBegin;
         /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
3182     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3183     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3184     /* forward solve the lower triangular */
         /* block row 0: nothing is subtracted, the right-hand side is copied into x */
3185     idx    = 0;
3186     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3187     x[4] = b[4+idx];x[5] = b[5+idx];
3188     for (i=1; i<n; i++) {
3189        v    = aa + bs2*ai[i];
3190        vi   = aj + ai[i];
            /* every block listed for row i in ai/aj takes part in the forward solve */
3191        nz   = ai[i+1] - ai[i];
3192       idx   = bs*i;
3193        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3194        s5   = b[4+idx];s6 = b[5+idx];
            /* s -= block * x[block column]; idx must stay bs*i, hence the separate jdx */
3195        for(k=0;k<nz;k++){
3196           jdx   = bs*vi[k];
3197           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3198 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3199           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3200           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3201           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3202 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3203           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3204 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3205           v   +=  bs2;
3206         }
3207 
3208        x[idx]   = s1;
3209        x[1+idx] = s2;
3210        x[2+idx] = s3;
3211        x[3+idx] = s4;
3212        x[4+idx] = s5;
3213        x[5+idx] = s6;
3214     }
3215 
3216    /* backward solve the upper triangular */
3217   for (i=n-1; i>=0; i--){
         /* blocks for row i start one entry past adiag[i+1] and stop just before adiag[i] */
3218     v   = aa + bs2*(adiag[i+1]+1);
3219      vi  = aj + adiag[i+1]+1;
3220      nz  = adiag[i] - adiag[i+1]-1;
3221      idt = bs*i;
3222      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3223      s5 = x[4+idt];s6 = x[5+idt];
3224      for(k=0;k<nz;k++){
3225       idx   = bs*vi[k];
3226        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3227        x5    = x[4+idx];x6 = x[5+idx];
3228        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3229        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3230        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3231        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3232        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3233        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3234         v   +=  bs2;
3235     }
3236     /* x = inv_diagonal*x */
         /* after the loop v has advanced to entry adiag[i], the block applied here */
3237    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3238    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3239    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3240    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3241    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3242    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3243   }
3244 
3245   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3246   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*bs2 per counted block (a->nz) minus bs per column of A */
3247   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3248   PetscFunctionReturn(0);
3249 }
3250 
3251 #undef __FUNCT__
3252 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
     /*
        MatSolve_SeqBAIJ_5_inplace - Forward and backward triangular solve with 5x5
        blocks, applying the row index set a->row to the right-hand side and the
        column index set a->col to the result.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine, for block row i:
          entries ai[i] .. diag[i]-1      blocks used by the forward solve
          entry   diag[i]                 block multiplied into the row result at the
                                          end of the backward solve (sibling routines
                                          in this file label that step
                                          "x = inv_diagonal*x")
          entries diag[i]+1 .. ai[i+1]-1  blocks used by the backward solve
        Each block is 25 consecutive scalars addressed as v[row + 5*col].

        a->solve_work is used as scratch of 5*n scalars; presumably it is allocated
        elsewhere with at least that size -- not checked here.
        NOTE(review): the first block row is read unconditionally, so n = a->mbs is
        assumed to be at least 1 -- confirm callers never pass an empty matrix.
     */
3253 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3254 {
3255   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3256   IS                iscol=a->col,isrow=a->row;
3257   PetscErrorCode    ierr;
3258   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3259   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3260   PetscInt          i,nz,idx,idt,idc;
3261   const MatScalar   *aa=a->a,*v;
3262   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3263   const PetscScalar *b;
3264 
3265   PetscFunctionBegin;
       /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
3266   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3267   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* t holds the intermediate vector, indexed by block row i (not permuted) */
3268   t  = a->solve_work;
3269 
       /* r is walked forwards in the forward solve; c starts at the last entry and is
          walked backwards in the backward solve */
3270   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3271   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3272 
3273   /* forward solve the lower triangular */
       /* block row 0: nothing is subtracted, the permuted right-hand side is copied into t */
3274   idx    = 5*(*r++);
3275   t[0] = b[idx];   t[1] = b[1+idx];
3276   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3277   for (i=1; i<n; i++) {
3278     v     = aa + 25*ai[i];
3279     vi    = aj + ai[i];
         /* number of blocks stored before the diagonal entry of row i */
3280     nz    = diag[i] - ai[i];
3281     idx   = 5*(*r++);
3282     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3283     s5  = b[4+idx];
         /* s -= block * t[block column] for each of those blocks */
3284     while (nz--) {
3285       idx   = 5*(*vi++);
3286       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3287       x4    = t[3+idx];x5 = t[4+idx];
3288       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3289       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3290       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3291       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3292       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3293       v += 25;
3294     }
3295     idx = 5*i;
3296     t[idx]   = s1;t[1+idx] = s2;
3297     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3298   }
3299   /* backward solve the upper triangular */
3300   for (i=n-1; i>=0; i--){
         /* start at the block stored just after the diagonal entry of row i */
3301     v    = aa + 25*diag[i] + 25;
3302     vi   = aj + diag[i] + 1;
3303     nz   = ai[i+1] - diag[i] - 1;
3304     idt  = 5*i;
3305     s1 = t[idt];  s2 = t[1+idt];
3306     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3307     while (nz--) {
3308       idx   = 5*(*vi++);
3309       x1    = t[idx];   x2 = t[1+idx];
3310       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3311       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3312       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3313       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3314       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3315       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3316       v += 25;
3317     }
         /* multiply by the block stored at diag[i]; the result goes to x at the position
            given by the column index set and also back into t, which later iterations
            of this loop read */
3318     idc = 5*(*c--);
3319     v   = aa + 25*diag[i];
3320     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3321                                  v[15]*s4+v[20]*s5;
3322     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3323                                  v[16]*s4+v[21]*s5;
3324     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3325                                  v[17]*s4+v[22]*s5;
3326     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3327                                  v[18]*s4+v[23]*s5;
3328     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3329                                  v[19]*s4+v[24]*s5;
3330   }
3331 
3332   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3333   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3334   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3335   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*25 per counted block (a->nz) minus 5 per column of A */
3336   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3337   PetscFunctionReturn(0);
3338 }
3339 
3340 #undef __FUNCT__
3341 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
     /*
        MatSolve_SeqBAIJ_5 - Forward and backward triangular solve with 5x5 blocks,
        applying the row index set a->row to the right-hand side and the column
        index set a->col to the result.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine (differs from the _inplace variant):
          entries ai[i] .. ai[i+1]-1          all used by the forward solve of row i
          entries adiag[i+1]+1 .. adiag[i]-1  blocks used by the backward solve of row i
          entry   adiag[i]                    block multiplied into the row result last
                                              (sibling routines in this file label that
                                              step "x = inv_diagonal*x")
        adiag[i+1] is read for i = n-1, so adiag must have n+1 entries.
        Each block is 25 consecutive scalars addressed as v[row + 5*col].

        a->solve_work is used as scratch of 5*n scalars; presumably it is allocated
        elsewhere with at least that size -- not checked here.
        NOTE(review): the first block row is read unconditionally, so n = a->mbs is
        assumed to be at least 1 -- confirm callers never pass an empty matrix.
     */
3342 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3343 {
3344   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3345   IS                iscol=a->col,isrow=a->row;
3346   PetscErrorCode    ierr;
3347   const PetscInt    *r,*c,*rout,*cout;
3348   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3349   PetscInt          i,nz,idx,idt,idc,m;
3350   const MatScalar   *aa=a->a,*v;
3351   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3352   const PetscScalar *b;
3353 
3354   PetscFunctionBegin;
       /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
3355   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3356   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* t holds the intermediate vector, indexed by block row i (not permuted) */
3357   t  = a->solve_work;
3358 
       /* r and c are indexed directly by block row i below (r[i], c[i]) */
3359   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3360   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3361 
3362   /* forward solve the lower triangular */
       /* block row 0: nothing is subtracted, the permuted right-hand side is copied into t */
3363   idx    = 5*r[0];
3364   t[0] = b[idx];   t[1] = b[1+idx];
3365   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3366   for (i=1; i<n; i++) {
3367     v     = aa + 25*ai[i];
3368     vi    = aj + ai[i];
         /* every block listed for row i in ai/aj takes part in the forward solve */
3369     nz    = ai[i+1] - ai[i];
3370     idx   = 5*r[i];
3371     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3372     s5  = b[4+idx];
         /* s -= block * t[block column] for each of those blocks */
3373     for(m=0;m<nz;m++){
3374       idx   = 5*vi[m];
3375       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3376       x4    = t[3+idx];x5 = t[4+idx];
3377       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3378       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3379       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3380       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3381       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3382       v += 25;
3383     }
3384     idx = 5*i;
3385     t[idx]   = s1;t[1+idx] = s2;
3386     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3387   }
3388   /* backward solve the upper triangular */
3389   for (i=n-1; i>=0; i--){
         /* blocks for row i start one entry past adiag[i+1] and stop just before adiag[i] */
3390     v    = aa + 25*(adiag[i+1]+1);
3391     vi   = aj + adiag[i+1]+1;
3392     nz   = adiag[i] - adiag[i+1] - 1;
3393     idt  = 5*i;
3394     s1 = t[idt];  s2 = t[1+idt];
3395     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3396     for(m=0;m<nz;m++){
3397       idx   = 5*vi[m];
3398       x1    = t[idx];   x2 = t[1+idx];
3399       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3400       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3401       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3402       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3403       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3404       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3405       v += 25;
3406     }
         /* after the loop v has advanced to entry adiag[i]; multiply by that block and
            store the result in x (at the position given by c[i]) and back into t, which
            later iterations of this loop read */
3407     idc = 5*c[i];
3408     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3409                                  v[15]*s4+v[20]*s5;
3410     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3411                                  v[16]*s4+v[21]*s5;
3412     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3413                                  v[17]*s4+v[22]*s5;
3414     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3415                                  v[18]*s4+v[23]*s5;
3416     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3417                                  v[19]*s4+v[24]*s5;
3418   }
3419 
3420   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3421   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3422   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3423   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*25 per counted block (a->nz) minus 5 per column of A */
3424   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3425   PetscFunctionReturn(0);
3426 }
3427 
3428 #undef __FUNCT__
3429 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
     /*
        MatSolve_SeqBAIJ_5_NaturalOrdering_inplace - Forward and backward triangular
        solve with 5x5 blocks and no row/column index sets: block row i of bb maps
        straight to block row i of xx, and xx itself is the working vector.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine, for block row i:
          entries ai[i] .. diag[i]-1      blocks used by the forward solve
          entry   diag[i]                 block multiplied into the row result at the
                                          end of the backward solve (sibling routines
                                          in this file label that step
                                          "x = inv_diagonal*x")
          entries diag[i]+1 .. ai[i+1]-1  blocks used by the backward solve
        Each block is 25 consecutive scalars addressed as v[row + 5*col].

        NOTE(review): x[0..4] is written unconditionally, so n = a->mbs is assumed to
        be at least 1 -- confirm callers never pass an empty matrix.
     */
3430 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3431 {
3432   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3433   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3434   PetscInt          i,nz,idx,idt,jdx;
3435   PetscErrorCode    ierr;
3436   const MatScalar   *aa=a->a,*v;
3437   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3438   const PetscScalar *b;
3439 
3440   PetscFunctionBegin;
       /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
3441   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3442   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3443   /* forward solve the lower triangular */
       /* block row 0: nothing is subtracted, the right-hand side is copied into x */
3444   idx    = 0;
3445   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3446   for (i=1; i<n; i++) {
3447     v     =  aa + 25*ai[i];
3448     vi    =  aj + ai[i];
         /* number of blocks stored before the diagonal entry of row i */
3449     nz    =  diag[i] - ai[i];
3450     idx   =  5*i;
3451     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
         /* s -= block * x[block column]; idx must stay 5*i, hence the separate jdx */
3452     while (nz--) {
3453       jdx   = 5*(*vi++);
3454       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3455       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3456       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3457       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3458       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3459       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3460       v    += 25;
3461     }
3462     x[idx]   = s1;
3463     x[1+idx] = s2;
3464     x[2+idx] = s3;
3465     x[3+idx] = s4;
3466     x[4+idx] = s5;
3467   }
3468   /* backward solve the upper triangular */
3469   for (i=n-1; i>=0; i--){
         /* start at the block stored just after the diagonal entry of row i */
3470     v    = aa + 25*diag[i] + 25;
3471     vi   = aj + diag[i] + 1;
3472     nz   = ai[i+1] - diag[i] - 1;
3473     idt  = 5*i;
3474     s1 = x[idt];  s2 = x[1+idt];
3475     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3476     while (nz--) {
3477       idx   = 5*(*vi++);
3478       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3479       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3480       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3481       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3482       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3483       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3484       v    += 25;
3485     }
         /* multiply by the block stored at diag[i] and overwrite block row i of x */
3486     v        = aa + 25*diag[i];
3487     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3488     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3489     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3490     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3491     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3492   }
3493 
3494   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3495   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*25 per counted block (a->nz) minus 5 per column of A */
3496   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3497   PetscFunctionReturn(0);
3498 }
3499 
3500 #undef __FUNCT__
3501 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
     /*
        MatSolve_SeqBAIJ_5_NaturalOrdering - Forward and backward triangular solve
        with 5x5 blocks and no row/column index sets: block row i of bb maps straight
        to block row i of xx, and xx itself is the working vector.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine (differs from the _inplace variant):
          entries ai[i] .. ai[i+1]-1          all used by the forward solve of row i
          entries adiag[i+1]+1 .. adiag[i]-1  blocks used by the backward solve of row i
          entry   adiag[i]                    block applied last ("inv_diagonal" below)
        adiag[i+1] is read for i = n-1, so adiag must have n+1 entries.
        Each block is 25 consecutive scalars addressed as v[row + 5*col].

        NOTE(review): x[0..4] is written unconditionally, so n = a->mbs is assumed to
        be at least 1 -- confirm callers never pass an empty matrix.
     */
3502 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3503 {
3504   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3505   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3506   PetscInt          i,k,nz,idx,idt,jdx;
3507   PetscErrorCode    ierr;
3508   const MatScalar   *aa=a->a,*v;
3509   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3510   const PetscScalar *b;
3511 
3512   PetscFunctionBegin;
       /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
3513   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3515   /* forward solve the lower triangular */
       /* block row 0: nothing is subtracted, the right-hand side is copied into x */
3516   idx    = 0;
3517   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3518   for (i=1; i<n; i++) {
3519     v   = aa + 25*ai[i];
3520     vi  = aj + ai[i];
         /* every block listed for row i in ai/aj takes part in the forward solve */
3521     nz  = ai[i+1] - ai[i];
3522     idx = 5*i;
3523     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
         /* s -= block * x[block column]; idx must stay 5*i, hence the separate jdx */
3524     for(k=0;k<nz;k++) {
3525       jdx   = 5*vi[k];
3526       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3527       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3528       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3529       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3530       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3531       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3532       v    += 25;
3533     }
3534     x[idx]   = s1;
3535     x[1+idx] = s2;
3536     x[2+idx] = s3;
3537     x[3+idx] = s4;
3538     x[4+idx] = s5;
3539   }
3540 
3541   /* backward solve the upper triangular */
3542   for (i=n-1; i>=0; i--){
         /* blocks for row i start one entry past adiag[i+1] and stop just before adiag[i] */
3543     v   = aa + 25*(adiag[i+1]+1);
3544     vi  = aj + adiag[i+1]+1;
3545     nz  = adiag[i] - adiag[i+1]-1;
3546     idt = 5*i;
3547     s1 = x[idt];  s2 = x[1+idt];
3548     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3549     for(k=0;k<nz;k++){
3550       idx   = 5*vi[k];
3551       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3552       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3553       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3554       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3555       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3556       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3557       v    += 25;
3558     }
3559     /* x = inv_diagonal*x */
         /* after the loop v has advanced to entry adiag[i], the block applied here */
3560     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3561     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3562     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3563     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3564     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3565   }
3566 
3567   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3568   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*25 per counted block (a->nz) minus 5 per column of A */
3569   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3570   PetscFunctionReturn(0);
3571 }
3572 
3573 #undef __FUNCT__
3574 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
     /*
        MatSolve_SeqBAIJ_4_inplace - Forward and backward triangular solve with 4x4
        blocks, applying the row index set a->row to the right-hand side and the
        column index set a->col to the result.

        Input:
          A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read
          bb - right-hand side vector
        Output:
          xx - solution vector
        Returns 0 on success; every PETSc call is checked with CHKERRQ.

        Storage read by this routine, for block row i:
          entries ai[i] .. diag[i]-1      blocks used by the forward solve
          entry   diag[i]                 block multiplied into the row result at the
                                          end of the backward solve (sibling routines
                                          in this file label that step
                                          "x = inv_diagonal*x")
          entries diag[i]+1 .. ai[i+1]-1  blocks used by the backward solve
        Each block is 16 consecutive scalars addressed as v[row + 4*col].

        a->solve_work is used as scratch of 4*n scalars; presumably it is allocated
        elsewhere with at least that size -- not checked here.
        NOTE(review): the first block row is read unconditionally, so n = a->mbs is
        assumed to be at least 1 -- confirm callers never pass an empty matrix.
     */
3575 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3576 {
3577   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3578   IS                iscol=a->col,isrow=a->row;
3579   PetscErrorCode    ierr;
3580   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3581   PetscInt          i,nz,idx,idt,idc;
3582   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3583   const MatScalar   *aa=a->a,*v;
3584   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3585   const PetscScalar *b;
3586 
3587   PetscFunctionBegin;
       /* b is only read below; the cast is needed because VecGetArray takes a non-const pointer */
3588   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3589   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
       /* t holds the intermediate vector, indexed by block row i (not permuted) */
3590   t  = a->solve_work;
3591 
       /* r is walked forwards in the forward solve; c starts at the last entry and is
          walked backwards in the backward solve */
3592   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3593   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3594 
3595   /* forward solve the lower triangular */
       /* block row 0: nothing is subtracted, the permuted right-hand side is copied into t */
3596   idx    = 4*(*r++);
3597   t[0] = b[idx];   t[1] = b[1+idx];
3598   t[2] = b[2+idx]; t[3] = b[3+idx];
3599   for (i=1; i<n; i++) {
3600     v     = aa + 16*ai[i];
3601     vi    = aj + ai[i];
         /* number of blocks stored before the diagonal entry of row i */
3602     nz    = diag[i] - ai[i];
3603     idx   = 4*(*r++);
3604     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
         /* s -= block * t[block column] for each of those blocks */
3605     while (nz--) {
3606       idx   = 4*(*vi++);
3607       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3608       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3609       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3610       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3611       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3612       v    += 16;
3613     }
3614     idx        = 4*i;
3615     t[idx]   = s1;t[1+idx] = s2;
3616     t[2+idx] = s3;t[3+idx] = s4;
3617   }
3618   /* backward solve the upper triangular */
3619   for (i=n-1; i>=0; i--){
         /* start at the block stored just after the diagonal entry of row i */
3620     v    = aa + 16*diag[i] + 16;
3621     vi   = aj + diag[i] + 1;
3622     nz   = ai[i+1] - diag[i] - 1;
3623     idt  = 4*i;
3624     s1 = t[idt];  s2 = t[1+idt];
3625     s3 = t[2+idt];s4 = t[3+idt];
3626     while (nz--) {
3627       idx   = 4*(*vi++);
3628       x1    = t[idx];   x2 = t[1+idx];
3629       x3    = t[2+idx]; x4 = t[3+idx];
3630       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3631       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3632       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3633       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3634       v += 16;
3635     }
         /* multiply by the block stored at diag[i]; the result goes to x at the position
            given by the column index set and also back into t, which later iterations
            of this loop read */
3636     idc      = 4*(*c--);
3637     v        = aa + 16*diag[i];
3638     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3639     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3640     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3641     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3642   }
3643 
3644   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3645   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3646   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3647   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop estimate: 2*16 per counted block (a->nz) minus 4 per column of A */
3648   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3649   PetscFunctionReturn(0);
3650 }
3651 
3652 #undef __FUNCT__
3653 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3654 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3655 {
3656   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3657   IS                iscol=a->col,isrow=a->row;
3658   PetscErrorCode    ierr;
3659   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3660   PetscInt          i,nz,idx,idt,idc,m;
3661   const PetscInt    *r,*c,*rout,*cout;
3662   const MatScalar   *aa=a->a,*v;
3663   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3664   const PetscScalar *b;
3665 
3666   PetscFunctionBegin;
3667   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3668   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3669   t  = a->solve_work;
3670 
3671   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3672   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3673 
3674   /* forward solve the lower triangular */
3675   idx    = 4*r[0];
3676   t[0] = b[idx];   t[1] = b[1+idx];
3677   t[2] = b[2+idx]; t[3] = b[3+idx];
3678   for (i=1; i<n; i++) {
3679     v     = aa + 16*ai[i];
3680     vi    = aj + ai[i];
3681     nz    = ai[i+1] - ai[i];
3682     idx   = 4*r[i];
3683     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3684     for(m=0;m<nz;m++){
3685       idx   = 4*vi[m];
3686       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3687       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3688       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3689       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3690       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3691       v    += 16;
3692     }
3693     idx        = 4*i;
3694     t[idx]   = s1;t[1+idx] = s2;
3695     t[2+idx] = s3;t[3+idx] = s4;
3696   }
3697   /* backward solve the upper triangular */
3698   for (i=n-1; i>=0; i--){
3699     v    = aa + 16*(adiag[i+1]+1);
3700     vi   = aj + adiag[i+1]+1;
3701     nz   = adiag[i] - adiag[i+1] - 1;
3702     idt  = 4*i;
3703     s1 = t[idt];  s2 = t[1+idt];
3704     s3 = t[2+idt];s4 = t[3+idt];
3705     for(m=0;m<nz;m++){
3706       idx   = 4*vi[m];
3707       x1    = t[idx];   x2 = t[1+idx];
3708       x3    = t[2+idx]; x4 = t[3+idx];
3709       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3710       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3711       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3712       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3713       v += 16;
3714     }
3715     idc      = 4*c[i];
3716     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3717     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3718     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3719     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3720   }
3721 
3722   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3723   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3724   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3725   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3726   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3727   PetscFunctionReturn(0);
3728 }
3729 
3730 #undef __FUNCT__
3731 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3732 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3733 {
3734   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3735   IS                iscol=a->col,isrow=a->row;
3736   PetscErrorCode    ierr;
3737   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3738   PetscInt          i,nz,idx,idt,idc;
3739   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3740   const MatScalar   *aa=a->a,*v;
3741   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3742   PetscScalar       *x;
3743   const PetscScalar *b;
3744 
3745   PetscFunctionBegin;
3746   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3747   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3748   t  = (MatScalar *)a->solve_work;
3749 
3750   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3751   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3752 
3753   /* forward solve the lower triangular */
3754   idx    = 4*(*r++);
3755   t[0] = (MatScalar)b[idx];
3756   t[1] = (MatScalar)b[1+idx];
3757   t[2] = (MatScalar)b[2+idx];
3758   t[3] = (MatScalar)b[3+idx];
3759   for (i=1; i<n; i++) {
3760     v     = aa + 16*ai[i];
3761     vi    = aj + ai[i];
3762     nz    = diag[i] - ai[i];
3763     idx   = 4*(*r++);
3764     s1 = (MatScalar)b[idx];
3765     s2 = (MatScalar)b[1+idx];
3766     s3 = (MatScalar)b[2+idx];
3767     s4 = (MatScalar)b[3+idx];
3768     while (nz--) {
3769       idx   = 4*(*vi++);
3770       x1  = t[idx];
3771       x2  = t[1+idx];
3772       x3  = t[2+idx];
3773       x4  = t[3+idx];
3774       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3775       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3776       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3777       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3778       v    += 16;
3779     }
3780     idx        = 4*i;
3781     t[idx]   = s1;
3782     t[1+idx] = s2;
3783     t[2+idx] = s3;
3784     t[3+idx] = s4;
3785   }
3786   /* backward solve the upper triangular */
3787   for (i=n-1; i>=0; i--){
3788     v    = aa + 16*diag[i] + 16;
3789     vi   = aj + diag[i] + 1;
3790     nz   = ai[i+1] - diag[i] - 1;
3791     idt  = 4*i;
3792     s1 = t[idt];
3793     s2 = t[1+idt];
3794     s3 = t[2+idt];
3795     s4 = t[3+idt];
3796     while (nz--) {
3797       idx   = 4*(*vi++);
3798       x1  = t[idx];
3799       x2  = t[1+idx];
3800       x3  = t[2+idx];
3801       x4  = t[3+idx];
3802       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3803       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3804       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3805       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3806       v += 16;
3807     }
3808     idc      = 4*(*c--);
3809     v        = aa + 16*diag[i];
3810     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3811     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3812     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3813     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3814     x[idc]   = (PetscScalar)t[idt];
3815     x[1+idc] = (PetscScalar)t[1+idt];
3816     x[2+idc] = (PetscScalar)t[2+idt];
3817     x[3+idc] = (PetscScalar)t[3+idt];
3818  }
3819 
3820   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3821   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3822   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3823   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3824   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3825   PetscFunctionReturn(0);
3826 }
3827 
3828 #if defined (PETSC_HAVE_SSE)
3829 
3830 #include PETSC_HAVE_SSE
3831 
3832 #undef __FUNCT__
3833 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3834 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3835 {
3836   /*
3837      Note: This code uses demotion of double
3838      to float when performing the mixed-mode computation.
3839      This may not be numerically reasonable for all applications.
3840   */
3841   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3842   IS             iscol=a->col,isrow=a->row;
3843   PetscErrorCode ierr;
3844   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3845   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3846   MatScalar      *aa=a->a,*v;
3847   PetscScalar    *x,*b,*t;
3848 
3849   /* Make space in temp stack for 16 Byte Aligned arrays */
3850   float           ssealignedspace[11],*tmps,*tmpx;
3851   unsigned long   offset;
3852 
3853   PetscFunctionBegin;
3854   SSE_SCOPE_BEGIN;
3855 
3856     offset = (unsigned long)ssealignedspace % 16;
3857     if (offset) offset = (16 - offset)/4;
3858     tmps = &ssealignedspace[offset];
3859     tmpx = &ssealignedspace[offset+4];
3860     PREFETCH_NTA(aa+16*ai[1]);
3861 
3862     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3863     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3864     t  = a->solve_work;
3865 
3866     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3867     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3868 
3869     /* forward solve the lower triangular */
3870     idx  = 4*(*r++);
3871     t[0] = b[idx];   t[1] = b[1+idx];
3872     t[2] = b[2+idx]; t[3] = b[3+idx];
3873     v    =  aa + 16*ai[1];
3874 
3875     for (i=1; i<n;) {
3876       PREFETCH_NTA(&v[8]);
3877       vi   =  aj      + ai[i];
3878       nz   =  diag[i] - ai[i];
3879       idx  =  4*(*r++);
3880 
3881       /* Demote sum from double to float */
3882       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3883       LOAD_PS(tmps,XMM7);
3884 
3885       while (nz--) {
3886         PREFETCH_NTA(&v[16]);
3887         idx = 4*(*vi++);
3888 
3889         /* Demote solution (so far) from double to float */
3890         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3891 
3892         /* 4x4 Matrix-Vector product with negative accumulation: */
3893         SSE_INLINE_BEGIN_2(tmpx,v)
3894           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3895 
3896           /* First Column */
3897           SSE_COPY_PS(XMM0,XMM6)
3898           SSE_SHUFFLE(XMM0,XMM0,0x00)
3899           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3900           SSE_SUB_PS(XMM7,XMM0)
3901 
3902           /* Second Column */
3903           SSE_COPY_PS(XMM1,XMM6)
3904           SSE_SHUFFLE(XMM1,XMM1,0x55)
3905           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3906           SSE_SUB_PS(XMM7,XMM1)
3907 
3908           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3909 
3910           /* Third Column */
3911           SSE_COPY_PS(XMM2,XMM6)
3912           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3913           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3914           SSE_SUB_PS(XMM7,XMM2)
3915 
3916           /* Fourth Column */
3917           SSE_COPY_PS(XMM3,XMM6)
3918           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3919           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3920           SSE_SUB_PS(XMM7,XMM3)
3921         SSE_INLINE_END_2
3922 
3923         v  += 16;
3924       }
3925       idx = 4*i;
3926       v   = aa + 16*ai[++i];
3927       PREFETCH_NTA(v);
3928       STORE_PS(tmps,XMM7);
3929 
3930       /* Promote result from float to double */
3931       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3932     }
3933     /* backward solve the upper triangular */
3934     idt  = 4*(n-1);
3935     ai16 = 16*diag[n-1];
3936     v    = aa + ai16 + 16;
3937     for (i=n-1; i>=0;){
3938       PREFETCH_NTA(&v[8]);
3939       vi = aj + diag[i] + 1;
3940       nz = ai[i+1] - diag[i] - 1;
3941 
3942       /* Demote accumulator from double to float */
3943       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3944       LOAD_PS(tmps,XMM7);
3945 
3946       while (nz--) {
3947         PREFETCH_NTA(&v[16]);
3948         idx = 4*(*vi++);
3949 
3950         /* Demote solution (so far) from double to float */
3951         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3952 
3953         /* 4x4 Matrix-Vector Product with negative accumulation: */
3954         SSE_INLINE_BEGIN_2(tmpx,v)
3955           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3956 
3957           /* First Column */
3958           SSE_COPY_PS(XMM0,XMM6)
3959           SSE_SHUFFLE(XMM0,XMM0,0x00)
3960           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3961           SSE_SUB_PS(XMM7,XMM0)
3962 
3963           /* Second Column */
3964           SSE_COPY_PS(XMM1,XMM6)
3965           SSE_SHUFFLE(XMM1,XMM1,0x55)
3966           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3967           SSE_SUB_PS(XMM7,XMM1)
3968 
3969           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3970 
3971           /* Third Column */
3972           SSE_COPY_PS(XMM2,XMM6)
3973           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3974           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3975           SSE_SUB_PS(XMM7,XMM2)
3976 
3977           /* Fourth Column */
3978           SSE_COPY_PS(XMM3,XMM6)
3979           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3980           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3981           SSE_SUB_PS(XMM7,XMM3)
3982         SSE_INLINE_END_2
3983         v  += 16;
3984       }
3985       v    = aa + ai16;
3986       ai16 = 16*diag[--i];
3987       PREFETCH_NTA(aa+ai16+16);
3988       /*
3989          Scale the result by the diagonal 4x4 block,
3990          which was inverted as part of the factorization
3991       */
3992       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3993         /* First Column */
3994         SSE_COPY_PS(XMM0,XMM7)
3995         SSE_SHUFFLE(XMM0,XMM0,0x00)
3996         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3997 
3998         /* Second Column */
3999         SSE_COPY_PS(XMM1,XMM7)
4000         SSE_SHUFFLE(XMM1,XMM1,0x55)
4001         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4002         SSE_ADD_PS(XMM0,XMM1)
4003 
4004         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4005 
4006         /* Third Column */
4007         SSE_COPY_PS(XMM2,XMM7)
4008         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4009         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4010         SSE_ADD_PS(XMM0,XMM2)
4011 
4012         /* Fourth Column */
4013         SSE_COPY_PS(XMM3,XMM7)
4014         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4015         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4016         SSE_ADD_PS(XMM0,XMM3)
4017 
4018         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4019       SSE_INLINE_END_3
4020 
4021       /* Promote solution from float to double */
4022       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4023 
4024       /* Apply reordering to t and stream into x.    */
4025       /* This way, x doesn't pollute the cache.      */
4026       /* Be careful with size: 2 doubles = 4 floats! */
4027       idc  = 4*(*c--);
4028       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4029         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4030         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4031         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4032         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4033         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4034         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4035       SSE_INLINE_END_2
4036       v    = aa + ai16 + 16;
4037       idt -= 4;
4038     }
4039 
4040     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4041     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4042     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4043     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4044     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4045   SSE_SCOPE_END;
4046   PetscFunctionReturn(0);
4047 }
4048 
4049 #endif
4050 
4051 
4052 /*
4053       Special case where the matrix was ILU(0) factored in the natural
4054    ordering. This eliminates the need for the column and row permutation.
4055 */
4056 #undef __FUNCT__
4057 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
4058 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4059 {
4060   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4061   PetscInt          n=a->mbs;
4062   const PetscInt    *ai=a->i,*aj=a->j;
4063   PetscErrorCode    ierr;
4064   const PetscInt    *diag = a->diag;
4065   const MatScalar   *aa=a->a;
4066   PetscScalar       *x;
4067   const PetscScalar *b;
4068 
4069   PetscFunctionBegin;
4070   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4071   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4072 
4073 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4074   {
4075     static PetscScalar w[2000]; /* very BAD need to fix */
4076     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4077   }
4078 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4079   {
4080     static PetscScalar w[2000]; /* very BAD need to fix */
4081     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4082   }
4083 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4084   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4085 #else
4086   {
4087     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4088     const MatScalar *v;
4089     PetscInt        jdx,idt,idx,nz,i,ai16;
4090     const PetscInt  *vi;
4091 
4092   /* forward solve the lower triangular */
4093   idx    = 0;
4094   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4095   for (i=1; i<n; i++) {
4096     v     =  aa      + 16*ai[i];
4097     vi    =  aj      + ai[i];
4098     nz    =  diag[i] - ai[i];
4099     idx   +=  4;
4100     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4101     while (nz--) {
4102       jdx   = 4*(*vi++);
4103       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4104       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4105       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4106       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4107       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4108       v    += 16;
4109     }
4110     x[idx]   = s1;
4111     x[1+idx] = s2;
4112     x[2+idx] = s3;
4113     x[3+idx] = s4;
4114   }
4115   /* backward solve the upper triangular */
4116   idt = 4*(n-1);
4117   for (i=n-1; i>=0; i--){
4118     ai16 = 16*diag[i];
4119     v    = aa + ai16 + 16;
4120     vi   = aj + diag[i] + 1;
4121     nz   = ai[i+1] - diag[i] - 1;
4122     s1 = x[idt];  s2 = x[1+idt];
4123     s3 = x[2+idt];s4 = x[3+idt];
4124     while (nz--) {
4125       idx   = 4*(*vi++);
4126       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4127       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4128       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4129       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4130       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4131       v    += 16;
4132     }
4133     v        = aa + ai16;
4134     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4135     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4136     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4137     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4138     idt -= 4;
4139   }
4140   }
4141 #endif
4142 
4143   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4144   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4145   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4146   PetscFunctionReturn(0);
4147 }
4148 
4149 #undef __FUNCT__
4150 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4151 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4152 {
4153     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4154     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4155     PetscInt          i,k,nz,idx,jdx,idt;
4156     PetscErrorCode    ierr;
4157     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4158     const MatScalar   *aa=a->a,*v;
4159     PetscScalar       *x;
4160     const PetscScalar *b;
4161     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4162 
4163     PetscFunctionBegin;
4164     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4165     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4166     /* forward solve the lower triangular */
4167     idx    = 0;
4168     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4169     for (i=1; i<n; i++) {
4170        v    = aa + bs2*ai[i];
4171        vi   = aj + ai[i];
4172        nz   = ai[i+1] - ai[i];
4173       idx   = bs*i;
4174        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4175       for(k=0;k<nz;k++) {
4176           jdx   = bs*vi[k];
4177           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4178           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4179           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4180           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4181 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4182 
4183           v   +=  bs2;
4184         }
4185 
4186        x[idx]   = s1;
4187        x[1+idx] = s2;
4188        x[2+idx] = s3;
4189        x[3+idx] = s4;
4190     }
4191 
4192    /* backward solve the upper triangular */
4193   for (i=n-1; i>=0; i--){
4194     v   = aa + bs2*(adiag[i+1]+1);
4195      vi  = aj + adiag[i+1]+1;
4196      nz  = adiag[i] - adiag[i+1]-1;
4197      idt = bs*i;
4198      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4199 
4200     for(k=0;k<nz;k++){
4201       idx   = bs*vi[k];
4202        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4203        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4204        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4205        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4206        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4207 
4208         v   +=  bs2;
4209     }
4210     /* x = inv_diagonal*x */
4211    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4212    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4213    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4214    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4215 
4216   }
4217 
4218   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4219   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4220   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4221   PetscFunctionReturn(0);
4222 }
4223 
4224 #undef __FUNCT__
4225 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4226 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4227 {
4228   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4229   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4230   PetscErrorCode    ierr;
4231   const MatScalar   *aa=a->a;
4232   const PetscScalar *b;
4233   PetscScalar       *x;
4234 
4235   PetscFunctionBegin;
4236   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4237   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4238 
4239   {
4240     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4241     const MatScalar  *v;
4242     MatScalar        *t=(MatScalar *)x;
4243     PetscInt         jdx,idt,idx,nz,i,ai16;
4244     const PetscInt   *vi;
4245 
4246     /* forward solve the lower triangular */
4247     idx  = 0;
4248     t[0] = (MatScalar)b[0];
4249     t[1] = (MatScalar)b[1];
4250     t[2] = (MatScalar)b[2];
4251     t[3] = (MatScalar)b[3];
4252     for (i=1; i<n; i++) {
4253       v     =  aa      + 16*ai[i];
4254       vi    =  aj      + ai[i];
4255       nz    =  diag[i] - ai[i];
4256       idx   +=  4;
4257       s1 = (MatScalar)b[idx];
4258       s2 = (MatScalar)b[1+idx];
4259       s3 = (MatScalar)b[2+idx];
4260       s4 = (MatScalar)b[3+idx];
4261       while (nz--) {
4262         jdx = 4*(*vi++);
4263         x1  = t[jdx];
4264         x2  = t[1+jdx];
4265         x3  = t[2+jdx];
4266         x4  = t[3+jdx];
4267         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4268         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4269         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4270         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4271         v    += 16;
4272       }
4273       t[idx]   = s1;
4274       t[1+idx] = s2;
4275       t[2+idx] = s3;
4276       t[3+idx] = s4;
4277     }
4278     /* backward solve the upper triangular */
4279     idt = 4*(n-1);
4280     for (i=n-1; i>=0; i--){
4281       ai16 = 16*diag[i];
4282       v    = aa + ai16 + 16;
4283       vi   = aj + diag[i] + 1;
4284       nz   = ai[i+1] - diag[i] - 1;
4285       s1   = t[idt];
4286       s2   = t[1+idt];
4287       s3   = t[2+idt];
4288       s4   = t[3+idt];
4289       while (nz--) {
4290         idx = 4*(*vi++);
4291         x1  = (MatScalar)x[idx];
4292         x2  = (MatScalar)x[1+idx];
4293         x3  = (MatScalar)x[2+idx];
4294         x4  = (MatScalar)x[3+idx];
4295         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4296         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4297         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4298         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4299         v    += 16;
4300       }
4301       v        = aa + ai16;
4302       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4303       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4304       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4305       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4306       idt -= 4;
4307     }
4308   }
4309 
4310   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4311   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4312   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4313   PetscFunctionReturn(0);
4314 }
4315 
4316 #if defined (PETSC_HAVE_SSE)
4317 
4318 #include PETSC_HAVE_SSE
4319 #undef __FUNCT__
4320 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4321 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4322 {
4323   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4324   unsigned short *aj=(unsigned short *)a->j;
4325   PetscErrorCode ierr;
4326   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4327   MatScalar      *aa=a->a;
4328   PetscScalar    *x,*b;
4329 
4330   PetscFunctionBegin;
4331   SSE_SCOPE_BEGIN;
4332   /*
4333      Note: This code currently uses demotion of double
4334      to float when performing the mixed-mode computation.
4335      This may not be numerically reasonable for all applications.
4336   */
4337   PREFETCH_NTA(aa+16*ai[1]);
4338 
4339   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4340   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4341   {
4342     /* x will first be computed in single precision then promoted inplace to double */
4343     MatScalar      *v,*t=(MatScalar *)x;
4344     int            nz,i,idt,ai16;
4345     unsigned int   jdx,idx;
4346     unsigned short *vi;
4347     /* Forward solve the lower triangular factor. */
4348 
4349     /* First block is the identity. */
4350     idx  = 0;
4351     CONVERT_DOUBLE4_FLOAT4(t,b);
4352     v    =  aa + 16*((unsigned int)ai[1]);
4353 
4354     for (i=1; i<n;) {
4355       PREFETCH_NTA(&v[8]);
4356       vi   =  aj      + ai[i];
4357       nz   =  diag[i] - ai[i];
4358       idx +=  4;
4359 
4360       /* Demote RHS from double to float. */
4361       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4362       LOAD_PS(&t[idx],XMM7);
4363 
4364       while (nz--) {
4365         PREFETCH_NTA(&v[16]);
4366         jdx = 4*((unsigned int)(*vi++));
4367 
4368         /* 4x4 Matrix-Vector product with negative accumulation: */
4369         SSE_INLINE_BEGIN_2(&t[jdx],v)
4370           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4371 
4372           /* First Column */
4373           SSE_COPY_PS(XMM0,XMM6)
4374           SSE_SHUFFLE(XMM0,XMM0,0x00)
4375           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4376           SSE_SUB_PS(XMM7,XMM0)
4377 
4378           /* Second Column */
4379           SSE_COPY_PS(XMM1,XMM6)
4380           SSE_SHUFFLE(XMM1,XMM1,0x55)
4381           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4382           SSE_SUB_PS(XMM7,XMM1)
4383 
4384           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4385 
4386           /* Third Column */
4387           SSE_COPY_PS(XMM2,XMM6)
4388           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4389           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4390           SSE_SUB_PS(XMM7,XMM2)
4391 
4392           /* Fourth Column */
4393           SSE_COPY_PS(XMM3,XMM6)
4394           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4395           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4396           SSE_SUB_PS(XMM7,XMM3)
4397         SSE_INLINE_END_2
4398 
4399         v  += 16;
4400       }
4401       v    =  aa + 16*ai[++i];
4402       PREFETCH_NTA(v);
4403       STORE_PS(&t[idx],XMM7);
4404     }
4405 
4406     /* Backward solve the upper triangular factor.*/
4407 
4408     idt  = 4*(n-1);
4409     ai16 = 16*diag[n-1];
4410     v    = aa + ai16 + 16;
4411     for (i=n-1; i>=0;){
4412       PREFETCH_NTA(&v[8]);
4413       vi = aj + diag[i] + 1;
4414       nz = ai[i+1] - diag[i] - 1;
4415 
4416       LOAD_PS(&t[idt],XMM7);
4417 
4418       while (nz--) {
4419         PREFETCH_NTA(&v[16]);
4420         idx = 4*((unsigned int)(*vi++));
4421 
4422         /* 4x4 Matrix-Vector Product with negative accumulation: */
4423         SSE_INLINE_BEGIN_2(&t[idx],v)
4424           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4425 
4426           /* First Column */
4427           SSE_COPY_PS(XMM0,XMM6)
4428           SSE_SHUFFLE(XMM0,XMM0,0x00)
4429           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4430           SSE_SUB_PS(XMM7,XMM0)
4431 
4432           /* Second Column */
4433           SSE_COPY_PS(XMM1,XMM6)
4434           SSE_SHUFFLE(XMM1,XMM1,0x55)
4435           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4436           SSE_SUB_PS(XMM7,XMM1)
4437 
4438           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4439 
4440           /* Third Column */
4441           SSE_COPY_PS(XMM2,XMM6)
4442           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4443           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4444           SSE_SUB_PS(XMM7,XMM2)
4445 
4446           /* Fourth Column */
4447           SSE_COPY_PS(XMM3,XMM6)
4448           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4449           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4450           SSE_SUB_PS(XMM7,XMM3)
4451         SSE_INLINE_END_2
4452         v  += 16;
4453       }
4454       v    = aa + ai16;
4455       ai16 = 16*diag[--i];
4456       PREFETCH_NTA(aa+ai16+16);
4457       /*
4458          Scale the result by the diagonal 4x4 block,
4459          which was inverted as part of the factorization
4460       */
4461       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4462         /* First Column */
4463         SSE_COPY_PS(XMM0,XMM7)
4464         SSE_SHUFFLE(XMM0,XMM0,0x00)
4465         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4466 
4467         /* Second Column */
4468         SSE_COPY_PS(XMM1,XMM7)
4469         SSE_SHUFFLE(XMM1,XMM1,0x55)
4470         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4471         SSE_ADD_PS(XMM0,XMM1)
4472 
4473         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4474 
4475         /* Third Column */
4476         SSE_COPY_PS(XMM2,XMM7)
4477         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4478         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4479         SSE_ADD_PS(XMM0,XMM2)
4480 
4481         /* Fourth Column */
4482         SSE_COPY_PS(XMM3,XMM7)
4483         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4484         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4485         SSE_ADD_PS(XMM0,XMM3)
4486 
4487         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4488       SSE_INLINE_END_3
4489 
4490       v    = aa + ai16 + 16;
4491       idt -= 4;
4492     }
4493 
4494     /* Convert t from single precision back to double precision (inplace)*/
4495     idt = 4*(n-1);
4496     for (i=n-1;i>=0;i--) {
4497       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4498       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4499       PetscScalar *xtemp=&x[idt];
4500       MatScalar   *ttemp=&t[idt];
4501       xtemp[3] = (PetscScalar)ttemp[3];
4502       xtemp[2] = (PetscScalar)ttemp[2];
4503       xtemp[1] = (PetscScalar)ttemp[1];
4504       xtemp[0] = (PetscScalar)ttemp[0];
4505       idt -= 4;
4506     }
4507 
4508   } /* End of artificial scope. */
4509   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4510   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4511   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4512   SSE_SCOPE_END;
4513   PetscFunctionReturn(0);
4514 }
4515 
4516 #undef __FUNCT__
4517 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4518 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4519 {
4520   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4521   int            *aj=a->j;
4522   PetscErrorCode ierr;
4523   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4524   MatScalar      *aa=a->a;
4525   PetscScalar    *x,*b;
4526 
4527   PetscFunctionBegin;
4528   SSE_SCOPE_BEGIN;
4529   /*
4530      Note: This code currently uses demotion of double
4531      to float when performing the mixed-mode computation.
4532      This may not be numerically reasonable for all applications.
4533   */
4534   PREFETCH_NTA(aa+16*ai[1]);
4535 
4536   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4537   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4538   {
4539     /* x will first be computed in single precision then promoted inplace to double */
4540     MatScalar *v,*t=(MatScalar *)x;
4541     int       nz,i,idt,ai16;
4542     int       jdx,idx;
4543     int       *vi;
4544     /* Forward solve the lower triangular factor. */
4545 
4546     /* First block is the identity. */
4547     idx  = 0;
4548     CONVERT_DOUBLE4_FLOAT4(t,b);
4549     v    =  aa + 16*ai[1];
4550 
4551     for (i=1; i<n;) {
4552       PREFETCH_NTA(&v[8]);
4553       vi   =  aj      + ai[i];
4554       nz   =  diag[i] - ai[i];
4555       idx +=  4;
4556 
4557       /* Demote RHS from double to float. */
4558       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4559       LOAD_PS(&t[idx],XMM7);
4560 
4561       while (nz--) {
4562         PREFETCH_NTA(&v[16]);
4563         jdx = 4*(*vi++);
4564 /*          jdx = *vi++; */
4565 
4566         /* 4x4 Matrix-Vector product with negative accumulation: */
4567         SSE_INLINE_BEGIN_2(&t[jdx],v)
4568           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4569 
4570           /* First Column */
4571           SSE_COPY_PS(XMM0,XMM6)
4572           SSE_SHUFFLE(XMM0,XMM0,0x00)
4573           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4574           SSE_SUB_PS(XMM7,XMM0)
4575 
4576           /* Second Column */
4577           SSE_COPY_PS(XMM1,XMM6)
4578           SSE_SHUFFLE(XMM1,XMM1,0x55)
4579           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4580           SSE_SUB_PS(XMM7,XMM1)
4581 
4582           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4583 
4584           /* Third Column */
4585           SSE_COPY_PS(XMM2,XMM6)
4586           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4587           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4588           SSE_SUB_PS(XMM7,XMM2)
4589 
4590           /* Fourth Column */
4591           SSE_COPY_PS(XMM3,XMM6)
4592           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4593           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4594           SSE_SUB_PS(XMM7,XMM3)
4595         SSE_INLINE_END_2
4596 
4597         v  += 16;
4598       }
4599       v    =  aa + 16*ai[++i];
4600       PREFETCH_NTA(v);
4601       STORE_PS(&t[idx],XMM7);
4602     }
4603 
4604     /* Backward solve the upper triangular factor.*/
4605 
4606     idt  = 4*(n-1);
4607     ai16 = 16*diag[n-1];
4608     v    = aa + ai16 + 16;
4609     for (i=n-1; i>=0;){
4610       PREFETCH_NTA(&v[8]);
4611       vi = aj + diag[i] + 1;
4612       nz = ai[i+1] - diag[i] - 1;
4613 
4614       LOAD_PS(&t[idt],XMM7);
4615 
4616       while (nz--) {
4617         PREFETCH_NTA(&v[16]);
4618         idx = 4*(*vi++);
4619 /*          idx = *vi++; */
4620 
4621         /* 4x4 Matrix-Vector Product with negative accumulation: */
4622         SSE_INLINE_BEGIN_2(&t[idx],v)
4623           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4624 
4625           /* First Column */
4626           SSE_COPY_PS(XMM0,XMM6)
4627           SSE_SHUFFLE(XMM0,XMM0,0x00)
4628           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4629           SSE_SUB_PS(XMM7,XMM0)
4630 
4631           /* Second Column */
4632           SSE_COPY_PS(XMM1,XMM6)
4633           SSE_SHUFFLE(XMM1,XMM1,0x55)
4634           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4635           SSE_SUB_PS(XMM7,XMM1)
4636 
4637           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4638 
4639           /* Third Column */
4640           SSE_COPY_PS(XMM2,XMM6)
4641           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4642           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4643           SSE_SUB_PS(XMM7,XMM2)
4644 
4645           /* Fourth Column */
4646           SSE_COPY_PS(XMM3,XMM6)
4647           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4648           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4649           SSE_SUB_PS(XMM7,XMM3)
4650         SSE_INLINE_END_2
4651         v  += 16;
4652       }
4653       v    = aa + ai16;
4654       ai16 = 16*diag[--i];
4655       PREFETCH_NTA(aa+ai16+16);
4656       /*
4657          Scale the result by the diagonal 4x4 block,
4658          which was inverted as part of the factorization
4659       */
4660       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4661         /* First Column */
4662         SSE_COPY_PS(XMM0,XMM7)
4663         SSE_SHUFFLE(XMM0,XMM0,0x00)
4664         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4665 
4666         /* Second Column */
4667         SSE_COPY_PS(XMM1,XMM7)
4668         SSE_SHUFFLE(XMM1,XMM1,0x55)
4669         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4670         SSE_ADD_PS(XMM0,XMM1)
4671 
4672         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4673 
4674         /* Third Column */
4675         SSE_COPY_PS(XMM2,XMM7)
4676         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4677         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4678         SSE_ADD_PS(XMM0,XMM2)
4679 
4680         /* Fourth Column */
4681         SSE_COPY_PS(XMM3,XMM7)
4682         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4683         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4684         SSE_ADD_PS(XMM0,XMM3)
4685 
4686         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4687       SSE_INLINE_END_3
4688 
4689       v    = aa + ai16 + 16;
4690       idt -= 4;
4691     }
4692 
4693     /* Convert t from single precision back to double precision (inplace)*/
4694     idt = 4*(n-1);
4695     for (i=n-1;i>=0;i--) {
4696       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4697       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4698       PetscScalar *xtemp=&x[idt];
4699       MatScalar   *ttemp=&t[idt];
4700       xtemp[3] = (PetscScalar)ttemp[3];
4701       xtemp[2] = (PetscScalar)ttemp[2];
4702       xtemp[1] = (PetscScalar)ttemp[1];
4703       xtemp[0] = (PetscScalar)ttemp[0];
4704       idt -= 4;
4705     }
4706 
4707   } /* End of artificial scope. */
4708   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4709   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4710   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4711   SSE_SCOPE_END;
4712   PetscFunctionReturn(0);
4713 }
4714 
4715 #endif
4716 
4717 #undef __FUNCT__
4718 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4719 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4720 {
4721   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4722   IS                iscol=a->col,isrow=a->row;
4723   PetscErrorCode    ierr;
4724   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4725   PetscInt          i,nz,idx,idt,idc;
4726   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4727   const MatScalar   *aa=a->a,*v;
4728   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4729   const PetscScalar *b;
4730 
4731   PetscFunctionBegin;
4732   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4733   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4734   t  = a->solve_work;
4735 
4736   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4737   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4738 
4739   /* forward solve the lower triangular */
4740   idx    = 3*(*r++);
4741   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4742   for (i=1; i<n; i++) {
4743     v     = aa + 9*ai[i];
4744     vi    = aj + ai[i];
4745     nz    = diag[i] - ai[i];
4746     idx   = 3*(*r++);
4747     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4748     while (nz--) {
4749       idx   = 3*(*vi++);
4750       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4751       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4752       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4753       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4754       v += 9;
4755     }
4756     idx = 3*i;
4757     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4758   }
4759   /* backward solve the upper triangular */
4760   for (i=n-1; i>=0; i--){
4761     v    = aa + 9*diag[i] + 9;
4762     vi   = aj + diag[i] + 1;
4763     nz   = ai[i+1] - diag[i] - 1;
4764     idt  = 3*i;
4765     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4766     while (nz--) {
4767       idx   = 3*(*vi++);
4768       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4769       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4770       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4771       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4772       v += 9;
4773     }
4774     idc = 3*(*c--);
4775     v   = aa + 9*diag[i];
4776     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4777     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4778     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4779   }
4780   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4781   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4782   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4783   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4784   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4785   PetscFunctionReturn(0);
4786 }
4787 
4788 #undef __FUNCT__
4789 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4790 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4791 {
4792   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4793   IS                iscol=a->col,isrow=a->row;
4794   PetscErrorCode    ierr;
4795   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4796   PetscInt          i,nz,idx,idt,idc,m;
4797   const PetscInt    *r,*c,*rout,*cout;
4798   const MatScalar   *aa=a->a,*v;
4799   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4800   const PetscScalar *b;
4801 
4802   PetscFunctionBegin;
4803   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4804   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4805   t  = a->solve_work;
4806 
4807   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4808   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4809 
4810   /* forward solve the lower triangular */
4811   idx    = 3*r[0];
4812   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4813   for (i=1; i<n; i++) {
4814     v     = aa + 9*ai[i];
4815     vi    = aj + ai[i];
4816     nz    = ai[i+1] - ai[i];
4817     idx   = 3*r[i];
4818     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4819     for(m=0;m<nz;m++){
4820       idx   = 3*vi[m];
4821       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4822       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4823       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4824       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4825       v += 9;
4826     }
4827     idx = 3*i;
4828     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4829   }
4830   /* backward solve the upper triangular */
4831   for (i=n-1; i>=0; i--){
4832     v    = aa + 9*(adiag[i+1]+1);
4833     vi   = aj + adiag[i+1]+1;
4834     nz   = adiag[i] - adiag[i+1] - 1;
4835     idt  = 3*i;
4836     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4837     for(m=0;m<nz;m++){
4838       idx   = 3*vi[m];
4839       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4840       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4841       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4842       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4843       v += 9;
4844     }
4845     idc = 3*c[i];
4846     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4847     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4848     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4849   }
4850   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4851   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4852   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4853   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4854   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4855   PetscFunctionReturn(0);
4856 }
4857 
4858 /*
4859       Special case where the matrix was ILU(0) factored in the natural
4860    ordering. This eliminates the need for the column and row permutation.
4861 */
4862 #undef __FUNCT__
4863 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4864 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4865 {
4866   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4867   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4868   PetscErrorCode    ierr;
4869   const PetscInt    *diag = a->diag,*vi;
4870   const MatScalar   *aa=a->a,*v;
4871   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4872   const PetscScalar *b;
4873   PetscInt          jdx,idt,idx,nz,i;
4874 
4875   PetscFunctionBegin;
4876   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4877   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4878 
4879   /* forward solve the lower triangular */
4880   idx    = 0;
4881   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4882   for (i=1; i<n; i++) {
4883     v     =  aa      + 9*ai[i];
4884     vi    =  aj      + ai[i];
4885     nz    =  diag[i] - ai[i];
4886     idx   +=  3;
4887     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4888     while (nz--) {
4889       jdx   = 3*(*vi++);
4890       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4891       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4892       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4893       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4894       v    += 9;
4895     }
4896     x[idx]   = s1;
4897     x[1+idx] = s2;
4898     x[2+idx] = s3;
4899   }
4900   /* backward solve the upper triangular */
4901   for (i=n-1; i>=0; i--){
4902     v    = aa + 9*diag[i] + 9;
4903     vi   = aj + diag[i] + 1;
4904     nz   = ai[i+1] - diag[i] - 1;
4905     idt  = 3*i;
4906     s1 = x[idt];  s2 = x[1+idt];
4907     s3 = x[2+idt];
4908     while (nz--) {
4909       idx   = 3*(*vi++);
4910       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4911       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4912       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4913       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4914       v    += 9;
4915     }
4916     v        = aa +  9*diag[i];
4917     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4918     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4919     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4920   }
4921 
4922   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4923   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4924   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4925   PetscFunctionReturn(0);
4926 }
4927 
4928 #undef __FUNCT__
4929 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4930 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4931 {
4932     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4933     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4934     PetscErrorCode    ierr;
4935     PetscInt          i,k,nz,idx,jdx,idt;
4936     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4937     const MatScalar   *aa=a->a,*v;
4938     PetscScalar       *x;
4939     const PetscScalar *b;
4940     PetscScalar        s1,s2,s3,x1,x2,x3;
4941 
4942     PetscFunctionBegin;
4943     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4944     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4945     /* forward solve the lower triangular */
4946     idx    = 0;
4947     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4948     for (i=1; i<n; i++) {
4949        v    = aa + bs2*ai[i];
4950        vi   = aj + ai[i];
4951        nz   = ai[i+1] - ai[i];
4952       idx   = bs*i;
4953        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4954       for(k=0;k<nz;k++){
4955          jdx   = bs*vi[k];
4956           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4957           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4958           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4959           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4960 
4961           v   +=  bs2;
4962         }
4963 
4964        x[idx]   = s1;
4965        x[1+idx] = s2;
4966        x[2+idx] = s3;
4967     }
4968 
4969    /* backward solve the upper triangular */
4970   for (i=n-1; i>=0; i--){
4971     v   = aa + bs2*(adiag[i+1]+1);
4972      vi  = aj + adiag[i+1]+1;
4973      nz  = adiag[i] - adiag[i+1]-1;
4974      idt = bs*i;
4975      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4976 
4977      for(k=0;k<nz;k++){
4978        idx   = bs*vi[k];
4979        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4980        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4981        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4982        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4983 
4984         v   +=  bs2;
4985     }
4986     /* x = inv_diagonal*x */
4987    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4988    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4989    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4990 
4991   }
4992 
4993   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4994   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4995   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4996   PetscFunctionReturn(0);
4997 }
4998 
4999 #undef __FUNCT__
5000 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
5001 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
5002 {
5003   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5004   IS                iscol=a->col,isrow=a->row;
5005   PetscErrorCode    ierr;
5006   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5007   PetscInt          i,nz,idx,idt,idc;
5008   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5009   const MatScalar   *aa=a->a,*v;
5010   PetscScalar       *x,s1,s2,x1,x2,*t;
5011   const PetscScalar *b;
5012 
5013   PetscFunctionBegin;
5014   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5015   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5016   t  = a->solve_work;
5017 
5018   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5019   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5020 
5021   /* forward solve the lower triangular */
5022   idx    = 2*(*r++);
5023   t[0] = b[idx]; t[1] = b[1+idx];
5024   for (i=1; i<n; i++) {
5025     v     = aa + 4*ai[i];
5026     vi    = aj + ai[i];
5027     nz    = diag[i] - ai[i];
5028     idx   = 2*(*r++);
5029     s1  = b[idx]; s2 = b[1+idx];
5030     while (nz--) {
5031       idx   = 2*(*vi++);
5032       x1    = t[idx]; x2 = t[1+idx];
5033       s1 -= v[0]*x1 + v[2]*x2;
5034       s2 -= v[1]*x1 + v[3]*x2;
5035       v += 4;
5036     }
5037     idx = 2*i;
5038     t[idx] = s1; t[1+idx] = s2;
5039   }
5040   /* backward solve the upper triangular */
5041   for (i=n-1; i>=0; i--){
5042     v    = aa + 4*diag[i] + 4;
5043     vi   = aj + diag[i] + 1;
5044     nz   = ai[i+1] - diag[i] - 1;
5045     idt  = 2*i;
5046     s1 = t[idt]; s2 = t[1+idt];
5047     while (nz--) {
5048       idx   = 2*(*vi++);
5049       x1    = t[idx]; x2 = t[1+idx];
5050       s1 -= v[0]*x1 + v[2]*x2;
5051       s2 -= v[1]*x1 + v[3]*x2;
5052       v += 4;
5053     }
5054     idc = 2*(*c--);
5055     v   = aa + 4*diag[i];
5056     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5057     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5058   }
5059   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5060   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5061   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5062   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5063   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5064   PetscFunctionReturn(0);
5065 }
5066 
5067 #undef __FUNCT__
5068 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
5069 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5070 {
5071   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5072   IS                iscol=a->col,isrow=a->row;
5073   PetscErrorCode    ierr;
5074   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5075   PetscInt          i,nz,idx,jdx,idt,idc,m;
5076   const PetscInt    *r,*c,*rout,*cout;
5077   const MatScalar   *aa=a->a,*v;
5078   PetscScalar       *x,s1,s2,x1,x2,*t;
5079   const PetscScalar *b;
5080 
5081   PetscFunctionBegin;
5082   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5083   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5084   t  = a->solve_work;
5085 
5086   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5087   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5088 
5089   /* forward solve the lower triangular */
5090   idx    = 2*r[0];
5091   t[0] = b[idx]; t[1] = b[1+idx];
5092   for (i=1; i<n; i++) {
5093     v     = aa + 4*ai[i];
5094     vi    = aj + ai[i];
5095     nz    = ai[i+1] - ai[i];
5096     idx   = 2*r[i];
5097     s1  = b[idx]; s2 = b[1+idx];
5098     for(m=0;m<nz;m++){
5099       jdx   = 2*vi[m];
5100       x1    = t[jdx]; x2 = t[1+jdx];
5101       s1 -= v[0]*x1 + v[2]*x2;
5102       s2 -= v[1]*x1 + v[3]*x2;
5103       v += 4;
5104     }
5105     idx = 2*i;
5106     t[idx] = s1; t[1+idx] = s2;
5107   }
5108   /* backward solve the upper triangular */
5109   for (i=n-1; i>=0; i--){
5110     v    = aa + 4*(adiag[i+1]+1);
5111     vi   = aj + adiag[i+1]+1;
5112     nz   = adiag[i] - adiag[i+1] - 1;
5113     idt  = 2*i;
5114     s1 = t[idt]; s2 = t[1+idt];
5115     for(m=0;m<nz;m++){
5116       idx   = 2*vi[m];
5117       x1    = t[idx]; x2 = t[1+idx];
5118       s1 -= v[0]*x1 + v[2]*x2;
5119       s2 -= v[1]*x1 + v[3]*x2;
5120       v += 4;
5121     }
5122     idc = 2*c[i];
5123     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5124     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5125   }
5126   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5127   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5128   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5129   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5130   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5131   PetscFunctionReturn(0);
5132 }
5133 
5134 /*
5135       Special case where the matrix was ILU(0) factored in the natural
5136    ordering. This eliminates the need for the column and row permutation.
5137 */
5138 #undef __FUNCT__
5139 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5140 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5141 {
5142   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5143   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5144   PetscErrorCode    ierr;
5145   const MatScalar   *aa=a->a,*v;
5146   PetscScalar       *x,s1,s2,x1,x2;
5147   const PetscScalar *b;
5148   PetscInt          jdx,idt,idx,nz,i;
5149 
5150   PetscFunctionBegin;
5151   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5152   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5153 
5154   /* forward solve the lower triangular */
5155   idx    = 0;
5156   x[0]   = b[0]; x[1] = b[1];
5157   for (i=1; i<n; i++) {
5158     v     =  aa      + 4*ai[i];
5159     vi    =  aj      + ai[i];
5160     nz    =  diag[i] - ai[i];
5161     idx   +=  2;
5162     s1  =  b[idx];s2 = b[1+idx];
5163     while (nz--) {
5164       jdx   = 2*(*vi++);
5165       x1    = x[jdx];x2 = x[1+jdx];
5166       s1 -= v[0]*x1 + v[2]*x2;
5167       s2 -= v[1]*x1 + v[3]*x2;
5168       v    += 4;
5169     }
5170     x[idx]   = s1;
5171     x[1+idx] = s2;
5172   }
5173   /* backward solve the upper triangular */
5174   for (i=n-1; i>=0; i--){
5175     v    = aa + 4*diag[i] + 4;
5176     vi   = aj + diag[i] + 1;
5177     nz   = ai[i+1] - diag[i] - 1;
5178     idt  = 2*i;
5179     s1 = x[idt];  s2 = x[1+idt];
5180     while (nz--) {
5181       idx   = 2*(*vi++);
5182       x1    = x[idx];   x2 = x[1+idx];
5183       s1 -= v[0]*x1 + v[2]*x2;
5184       s2 -= v[1]*x1 + v[3]*x2;
5185       v    += 4;
5186     }
5187     v        = aa +  4*diag[i];
5188     x[idt]   = v[0]*s1 + v[2]*s2;
5189     x[1+idt] = v[1]*s1 + v[3]*s2;
5190   }
5191 
5192   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5193   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5194   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5195   PetscFunctionReturn(0);
5196 }
5197 
5198 #undef __FUNCT__
5199 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5200 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5201 {
5202     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5203     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5204     PetscInt          i,k,nz,idx,idt,jdx;
5205     PetscErrorCode    ierr;
5206     const MatScalar   *aa=a->a,*v;
5207     PetscScalar       *x,s1,s2,x1,x2;
5208     const PetscScalar *b;
5209 
5210     PetscFunctionBegin;
5211     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5212     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5213     /* forward solve the lower triangular */
5214     idx    = 0;
5215     x[0] = b[idx]; x[1] = b[1+idx];
5216     for (i=1; i<n; i++) {
5217         v   = aa + 4*ai[i];
5218        vi   = aj + ai[i];
5219        nz   = ai[i+1] - ai[i];
5220        idx  = 2*i;
5221        s1   = b[idx];s2 = b[1+idx];
5222       for(k=0;k<nz;k++){
5223          jdx   = 2*vi[k];
5224           x1    = x[jdx];x2 = x[1+jdx];
5225           s1   -= v[0]*x1 + v[2]*x2;
5226           s2   -= v[1]*x1 + v[3]*x2;
5227            v   +=  4;
5228         }
5229        x[idx]   = s1;
5230        x[1+idx] = s2;
5231     }
5232 
5233    /* backward solve the upper triangular */
5234   for (i=n-1; i>=0; i--){
5235      v   = aa + 4*(adiag[i+1]+1);
5236      vi  = aj + adiag[i+1]+1;
5237      nz  = adiag[i] - adiag[i+1]-1;
5238      idt = 2*i;
5239      s1 = x[idt];  s2 = x[1+idt];
5240      for(k=0;k<nz;k++){
5241       idx   = 2*vi[k];
5242        x1    = x[idx];   x2 = x[1+idx];
5243        s1 -= v[0]*x1 + v[2]*x2;
5244        s2 -= v[1]*x1 + v[3]*x2;
5245          v    += 4;
5246     }
5247     /* x = inv_diagonal*x */
5248    x[idt]   = v[0]*s1 + v[2]*s2;
5249    x[1+idt] = v[1]*s1 + v[3]*s2;
5250   }
5251 
5252   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5253   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5254   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5255   PetscFunctionReturn(0);
5256 }
5257 
5258 #undef __FUNCT__
5259 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5260 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5261 {
5262   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5263   IS                iscol=a->col,isrow=a->row;
5264   PetscErrorCode    ierr;
5265   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5266   PetscInt          i,nz;
5267   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5268   const MatScalar   *aa=a->a,*v;
5269   PetscScalar       *x,s1,*t;
5270   const PetscScalar *b;
5271 
5272   PetscFunctionBegin;
5273   if (!n) PetscFunctionReturn(0);
5274 
5275   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5276   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5277   t  = a->solve_work;
5278 
5279   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5280   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5281 
5282   /* forward solve the lower triangular */
5283   t[0] = b[*r++];
5284   for (i=1; i<n; i++) {
5285     v     = aa + ai[i];
5286     vi    = aj + ai[i];
5287     nz    = diag[i] - ai[i];
5288     s1  = b[*r++];
5289     while (nz--) {
5290       s1 -= (*v++)*t[*vi++];
5291     }
5292     t[i] = s1;
5293   }
5294   /* backward solve the upper triangular */
5295   for (i=n-1; i>=0; i--){
5296     v    = aa + diag[i] + 1;
5297     vi   = aj + diag[i] + 1;
5298     nz   = ai[i+1] - diag[i] - 1;
5299     s1 = t[i];
5300     while (nz--) {
5301       s1 -= (*v++)*t[*vi++];
5302     }
5303     x[*c--] = t[i] = aa[diag[i]]*s1;
5304   }
5305 
5306   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5307   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5308   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5309   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5310   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5311   PetscFunctionReturn(0);
5312 }
5313 /*
5314       Special case where the matrix was ILU(0) factored in the natural
5315    ordering. This eliminates the need for the column and row permutation.
5316 */
5317 #undef __FUNCT__
5318 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5319 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5320 {
5321   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5322   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5323   PetscErrorCode    ierr;
5324   const MatScalar   *aa=a->a,*v;
5325   PetscScalar       *x;
5326   const PetscScalar *b;
5327   PetscScalar       s1,x1;
5328   PetscInt          jdx,idt,idx,nz,i;
5329 
5330   PetscFunctionBegin;
5331   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5332   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5333 
5334   /* forward solve the lower triangular */
5335   idx    = 0;
5336   x[0]   = b[0];
5337   for (i=1; i<n; i++) {
5338     v     =  aa      + ai[i];
5339     vi    =  aj      + ai[i];
5340     nz    =  diag[i] - ai[i];
5341     idx   +=  1;
5342     s1  =  b[idx];
5343     while (nz--) {
5344       jdx   = *vi++;
5345       x1    = x[jdx];
5346       s1 -= v[0]*x1;
5347       v    += 1;
5348     }
5349     x[idx]   = s1;
5350   }
5351   /* backward solve the upper triangular */
5352   for (i=n-1; i>=0; i--){
5353     v    = aa + diag[i] + 1;
5354     vi   = aj + diag[i] + 1;
5355     nz   = ai[i+1] - diag[i] - 1;
5356     idt  = i;
5357     s1 = x[idt];
5358     while (nz--) {
5359       idx   = *vi++;
5360       x1    = x[idx];
5361       s1 -= v[0]*x1;
5362       v    += 1;
5363     }
5364     v        = aa +  diag[i];
5365     x[idt]   = v[0]*s1;
5366   }
5367   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5369   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5370   PetscFunctionReturn(0);
5371 }
5372 
5373 /* ----------------------------------------------------------------*/
5374 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5375 
5376 #undef __FUNCT__
5377 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5378 /*
5379    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5380 */
5381 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5382 {
5383   Mat             C=B;
5384   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5385   PetscErrorCode  ierr;
5386   PetscInt        i,j,k,ipvt[15];
5387   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5388   PetscInt        nz,nzL,row;
5389   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5390   const MatScalar *v,*aa=a->a;
5391   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5392 
5393   PetscFunctionBegin;
5394 
5395   /* generate work space needed by the factorization */
5396   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5397   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5398 
5399   for (i=0; i<n; i++){
5400     /* zero rtmp */
5401     /* L part */
5402     nz    = bi[i+1] - bi[i];
5403     bjtmp = bj + bi[i];
5404     for  (j=0; j<nz; j++){
5405       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5406     }
5407 
5408     /* U part */
5409     nz = bdiag[i] - bdiag[i+1];
5410     bjtmp = bj + bdiag[i+1]+1;
5411     for  (j=0; j<nz; j++){
5412       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5413     }
5414 
5415     /* load in initial (unfactored row) */
5416     nz    = ai[i+1] - ai[i];
5417     ajtmp = aj + ai[i];
5418     v     = aa + bs2*ai[i];
5419     for (j=0; j<nz; j++) {
5420       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5421     }
5422 
5423     /* elimination */
5424     bjtmp = bj + bi[i];
5425     nzL   = bi[i+1] - bi[i];
5426     for(k=0;k < nzL;k++) {
5427       row = bjtmp[k];
5428       pc = rtmp + bs2*row;
5429       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5430       if (flg) {
5431         pv = b->a + bs2*bdiag[row];
5432 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5433 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5434 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5435         pv = b->a + bs2*(bdiag[row+1]+1);
5436         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5437         for (j=0; j<nz; j++) {
5438           vv   = rtmp + bs2*pj[j];
5439           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5440 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5441 	  pv  += bs2;
5442         }
5443         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5444       }
5445     }
5446 
5447     /* finished row so stick it into b->a */
5448     /* L part */
5449     pv   = b->a + bs2*bi[i] ;
5450     pj   = b->j + bi[i] ;
5451     nz   = bi[i+1] - bi[i];
5452     for (j=0; j<nz; j++) {
5453       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5454     }
5455 
5456     /* Mark diagonal and invert diagonal for simplier triangular solves */
5457     pv   = b->a + bs2*bdiag[i];
5458     pj   = b->j + bdiag[i];
5459     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5460     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5461     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftinblocks);CHKERRQ(ierr);
5462 
5463     /* U part */
5464     pv = b->a + bs2*(bdiag[i+1]+1);
5465     pj = b->j + bdiag[i+1]+1;
5466     nz = bdiag[i] - bdiag[i+1] - 1;
5467     for (j=0; j<nz; j++){
5468       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5469     }
5470   }
5471 
5472   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5473   C->ops->solve          = MatSolve_SeqBAIJ_15_NaturalOrdering;
5474   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5475   C->assembled = PETSC_TRUE;
5476   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5477   PetscFunctionReturn(0);
5478 }
5479 
5480 #undef __FUNCT__
5481 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5482 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5483 {
5484   Mat            C=B;
5485   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5486   IS             isrow = b->row,isicol = b->icol;
5487   PetscErrorCode ierr;
5488   const PetscInt *r,*ic,*ics;
5489   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5490   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5491   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5492   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5493   MatScalar      *v_work;
5494   PetscTruth     col_identity,row_identity,both_identity;
5495 
5496   PetscFunctionBegin;
5497   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5498   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5499 
5500   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5501   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5502   ics  = ic;
5503 
5504   /* generate work space needed by dense LU factorization */
5505   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5506 
5507   for (i=0; i<n; i++){
5508     /* zero rtmp */
5509     /* L part */
5510     nz    = bi[i+1] - bi[i];
5511     bjtmp = bj + bi[i];
5512     for  (j=0; j<nz; j++){
5513       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5514     }
5515 
5516     /* U part */
5517     nz = bdiag[i] - bdiag[i+1];
5518     bjtmp = bj + bdiag[i+1]+1;
5519     for  (j=0; j<nz; j++){
5520       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5521     }
5522 
5523     /* load in initial (unfactored row) */
5524     nz    = ai[r[i]+1] - ai[r[i]];
5525     ajtmp = aj + ai[r[i]];
5526     v     = aa + bs2*ai[r[i]];
5527     for (j=0; j<nz; j++) {
5528       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5529     }
5530 
5531     /* elimination */
5532     bjtmp = bj + bi[i];
5533     nzL   = bi[i+1] - bi[i];
5534     for(k=0;k < nzL;k++) {
5535       row = bjtmp[k];
5536       pc = rtmp + bs2*row;
5537       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5538       if (flg) {
5539         pv         = b->a + bs2*bdiag[row];
5540         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5541         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5542         pv         = b->a + bs2*(bdiag[row+1]+1);
5543         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5544         for (j=0; j<nz; j++) {
5545           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5546         }
5547         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5548       }
5549     }
5550 
5551     /* finished row so stick it into b->a */
5552     /* L part */
5553     pv   = b->a + bs2*bi[i] ;
5554     pj   = b->j + bi[i] ;
5555     nz   = bi[i+1] - bi[i];
5556     for (j=0; j<nz; j++) {
5557       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5558     }
5559 
5560     /* Mark diagonal and invert diagonal for simplier triangular solves */
5561     pv  = b->a + bs2*bdiag[i];
5562     pj  = b->j + bdiag[i];
5563     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5564     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5565     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5566 
5567     /* U part */
5568     pv = b->a + bs2*(bdiag[i+1]+1);
5569     pj = b->j + bdiag[i+1]+1;
5570     nz = bdiag[i] - bdiag[i+1] - 1;
5571     for (j=0; j<nz; j++){
5572       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5573     }
5574   }
5575 
5576   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5577   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5578   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5579   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5580 
5581   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5582   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5583   both_identity = (PetscTruth) (row_identity && col_identity);
5584   if (both_identity){
5585     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5586   } else {
5587     C->ops->solve = MatSolve_SeqBAIJ_N;
5588   }
5589   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5590 
5591   C->assembled = PETSC_TRUE;
5592   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5593   PetscFunctionReturn(0);
5594 }
5595 
5596 /*
5597    ilu(0) with natural ordering under new data structure.
5598    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5599    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5600 */
5601 
5602 #undef __FUNCT__
5603 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5604 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5605 {
5606 
5607   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5608   PetscErrorCode     ierr;
5609   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5610   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5611 
5612   PetscFunctionBegin;
5613   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5614   b    = (Mat_SeqBAIJ*)(fact)->data;
5615 
5616   /* allocate matrix arrays for new data structure */
5617   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5618   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5619   b->singlemalloc = PETSC_TRUE;
5620   if (!b->diag){
5621     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5622     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5623   }
5624   bdiag = b->diag;
5625 
5626   if (n > 0) {
5627     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5628   }
5629 
5630   /* set bi and bj with new data structure */
5631   bi = b->i;
5632   bj = b->j;
5633 
5634   /* L part */
5635   bi[0] = 0;
5636   for (i=0; i<n; i++){
5637     nz = adiag[i] - ai[i];
5638     bi[i+1] = bi[i] + nz;
5639     aj = a->j + ai[i];
5640     for (j=0; j<nz; j++){
5641       *bj = aj[j]; bj++;
5642     }
5643   }
5644 
5645   /* U part */
5646   bi_temp = bi[n];
5647   bdiag[n] = bi[n]-1;
5648   for (i=n-1; i>=0; i--){
5649     nz = ai[i+1] - adiag[i] - 1;
5650     bi_temp = bi_temp + nz + 1;
5651     aj = a->j + adiag[i] + 1;
5652     for (j=0; j<nz; j++){
5653       *bj = aj[j]; bj++;
5654     }
5655     /* diag[i] */
5656     *bj = i; bj++;
5657     bdiag[i] = bi_temp - 1;
5658   }
5659   PetscFunctionReturn(0);
5660 }
5661 
5662 #undef __FUNCT__
5663 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5664 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5665 {
5666   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5667   IS                 isicol;
5668   PetscErrorCode     ierr;
5669   const PetscInt     *r,*ic;
5670   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5671   PetscInt           *bi,*cols,nnz,*cols_lvl;
5672   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5673   PetscInt           i,levels,diagonal_fill;
5674   PetscTruth         col_identity,row_identity,both_identity;
5675   PetscReal          f;
5676   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5677   PetscBT            lnkbt;
5678   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5679   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5680   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5681   PetscTruth         missing;
5682   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5683 
5684   PetscFunctionBegin;
5685   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5686   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5687   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5688 
5689   f             = info->fill;
5690   levels        = (PetscInt)info->levels;
5691   diagonal_fill = (PetscInt)info->diagonal_fill;
5692   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5693 
5694   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5695   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5696   both_identity = (PetscTruth) (row_identity && col_identity);
5697 
5698   if (!levels && both_identity) {
5699     /* special case: ilu(0) with natural ordering */
5700     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5701     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5702 
5703     fact->factor = MAT_FACTOR_ILU;
5704     (fact)->info.factor_mallocs    = 0;
5705     (fact)->info.fill_ratio_given  = info->fill;
5706     (fact)->info.fill_ratio_needed = 1.0;
5707     b                = (Mat_SeqBAIJ*)(fact)->data;
5708     b->row           = isrow;
5709     b->col           = iscol;
5710     b->icol          = isicol;
5711     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5712     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5713     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5714     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5715     PetscFunctionReturn(0);
5716   }
5717 
5718   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5719   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5720 
5721   /* get new row pointers */
5722   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5723   bi[0] = 0;
5724   /* bdiag is location of diagonal in factor */
5725   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5726   bdiag[0]  = 0;
5727 
5728   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5729 
5730   /* create a linked list for storing column indices of the active row */
5731   nlnk = n + 1;
5732   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5733 
5734   /* initial FreeSpace size is f*(ai[n]+1) */
5735   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5736   current_space = free_space;
5737   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5738   current_space_lvl = free_space_lvl;
5739 
5740   for (i=0; i<n; i++) {
5741     nzi = 0;
5742     /* copy current row into linked list */
5743     nnz  = ai[r[i]+1] - ai[r[i]];
5744     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5745     cols = aj + ai[r[i]];
5746     lnk[i] = -1; /* marker to indicate if diagonal exists */
5747     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5748     nzi += nlnk;
5749 
5750     /* make sure diagonal entry is included */
5751     if (diagonal_fill && lnk[i] == -1) {
5752       fm = n;
5753       while (lnk[fm] < i) fm = lnk[fm];
5754       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5755       lnk[fm]    = i;
5756       lnk_lvl[i] = 0;
5757       nzi++; dcount++;
5758     }
5759 
5760     /* add pivot rows into the active row */
5761     nzbd = 0;
5762     prow = lnk[n];
5763     while (prow < i) {
5764       nnz      = bdiag[prow];
5765       cols     = bj_ptr[prow] + nnz + 1;
5766       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5767       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5768       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5769       nzi += nlnk;
5770       prow = lnk[prow];
5771       nzbd++;
5772     }
5773     bdiag[i] = nzbd;
5774     bi[i+1]  = bi[i] + nzi;
5775 
5776     /* if free space is not available, make more free space */
5777     if (current_space->local_remaining<nzi) {
5778       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5779       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5780       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5781       reallocs++;
5782     }
5783 
5784     /* copy data into free_space and free_space_lvl, then initialize lnk */
5785     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5786     bj_ptr[i]    = current_space->array;
5787     bjlvl_ptr[i] = current_space_lvl->array;
5788 
5789     /* make sure the active row i has diagonal entry */
5790     if (*(bj_ptr[i]+bdiag[i]) != i) {
5791       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5792     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5793     }
5794 
5795     current_space->array           += nzi;
5796     current_space->local_used      += nzi;
5797     current_space->local_remaining -= nzi;
5798     current_space_lvl->array           += nzi;
5799     current_space_lvl->local_used      += nzi;
5800     current_space_lvl->local_remaining -= nzi;
5801   }
5802 
5803   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5804   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5805 
5806   /* destroy list of free space and other temporary arrays */
5807   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5808 
5809   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5810   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5811 
5812   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5813   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5814   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5815 
5816 #if defined(PETSC_USE_INFO)
5817   {
5818     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5819     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5820     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5821     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5822     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5823     if (diagonal_fill) {
5824       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5825     }
5826   }
5827 #endif
5828 
5829   /* put together the new matrix */
5830   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5831   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5832   b = (Mat_SeqBAIJ*)(fact)->data;
5833   b->free_a       = PETSC_TRUE;
5834   b->free_ij      = PETSC_TRUE;
5835   b->singlemalloc = PETSC_FALSE;
5836   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5837   b->j          = bj;
5838   b->i          = bi;
5839   b->diag       = bdiag;
5840   b->free_diag  = PETSC_TRUE;
5841   b->ilen       = 0;
5842   b->imax       = 0;
5843   b->row        = isrow;
5844   b->col        = iscol;
5845   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5846   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5847   b->icol       = isicol;
5848   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5849   /* In b structure:  Free imax, ilen, old a, old j.
5850      Allocate bdiag, solve_work, new a, new j */
5851   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5852   b->maxnz = b->nz = bdiag[0]+1;
5853   fact->info.factor_mallocs    = reallocs;
5854   fact->info.fill_ratio_given  = f;
5855   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5856   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5857   PetscFunctionReturn(0);
5858 }
5859 
5860 
5861 /*
5862      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5863    except that the data structure of Mat_SeqAIJ is slightly different.
5864    Not a good example of code reuse.
5865 */
5866 #undef __FUNCT__
5867 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
5868 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5869 {
5870   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5871   IS             isicol;
5872   PetscErrorCode ierr;
5873   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5874   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5875   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5876   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5877   PetscTruth     col_identity,row_identity,both_identity,flg;
5878   PetscReal      f;
5879 
5880   PetscFunctionBegin;
5881   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5882   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5883 
5884   f             = info->fill;
5885   levels        = (PetscInt)info->levels;
5886   diagonal_fill = (PetscInt)info->diagonal_fill;
5887   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5888 
5889   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5890   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5891   both_identity = (PetscTruth) (row_identity && col_identity);
5892 
5893   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5894     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5895     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5896 
5897     fact->factor = MAT_FACTOR_ILU;
5898     b            = (Mat_SeqBAIJ*)fact->data;
5899     b->row       = isrow;
5900     b->col       = iscol;
5901     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5902     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5903     b->icol      = isicol;
5904     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5905     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5906     PetscFunctionReturn(0);
5907   }
5908 
5909   /* general case perform the symbolic factorization */
5910     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5911     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5912 
5913     /* get new row pointers */
5914     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5915     ainew[0] = 0;
5916     /* don't know how many column pointers are needed so estimate */
5917     jmax = (PetscInt)(f*ai[n] + 1);
5918     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5919     /* ajfill is level of fill for each fill entry */
5920     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5921     /* fill is a linked list of nonzeros in active row */
5922     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5923     /* im is level for each filled value */
5924     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5925     /* dloc is location of diagonal in factor */
5926     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5927     dloc[0]  = 0;
5928     for (prow=0; prow<n; prow++) {
5929 
5930       /* copy prow into linked list */
5931       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5932       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5933       xi         = aj + ai[r[prow]];
5934       fill[n]    = n;
5935       fill[prow] = -1; /* marker for diagonal entry */
5936       while (nz--) {
5937 	fm  = n;
5938 	idx = ic[*xi++];
5939 	do {
5940 	  m  = fm;
5941 	  fm = fill[m];
5942 	} while (fm < idx);
5943 	fill[m]   = idx;
5944 	fill[idx] = fm;
5945 	im[idx]   = 0;
5946       }
5947 
5948       /* make sure diagonal entry is included */
5949       if (diagonal_fill && fill[prow] == -1) {
5950 	fm = n;
5951 	while (fill[fm] < prow) fm = fill[fm];
5952 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5953 	fill[fm]   = prow;
5954 	im[prow]   = 0;
5955 	nzf++;
5956 	dcount++;
5957       }
5958 
5959       nzi = 0;
5960       row = fill[n];
5961       while (row < prow) {
5962 	incrlev = im[row] + 1;
5963 	nz      = dloc[row];
5964 	xi      = ajnew  + ainew[row] + nz + 1;
5965 	flev    = ajfill + ainew[row] + nz + 1;
5966 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5967 	fm      = row;
5968 	while (nnz-- > 0) {
5969 	  idx = *xi++;
5970 	  if (*flev + incrlev > levels) {
5971 	    flev++;
5972 	    continue;
5973 	  }
5974 	  do {
5975 	    m  = fm;
5976 	    fm = fill[m];
5977 	  } while (fm < idx);
5978 	  if (fm != idx) {
5979 	    im[idx]   = *flev + incrlev;
5980 	    fill[m]   = idx;
5981 	    fill[idx] = fm;
5982 	    fm        = idx;
5983 	    nzf++;
5984 	  } else {
5985 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5986 	  }
5987 	  flev++;
5988 	}
5989 	row = fill[row];
5990 	nzi++;
5991       }
5992       /* copy new filled row into permanent storage */
5993       ainew[prow+1] = ainew[prow] + nzf;
5994       if (ainew[prow+1] > jmax) {
5995 
5996 	/* estimate how much additional space we will need */
5997 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5998 	/* just double the memory each time */
5999 	PetscInt maxadd = jmax;
6000 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
6001 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
6002 	jmax += maxadd;
6003 
6004 	/* allocate a longer ajnew and ajfill */
6005 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6006 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6007 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
6008 	ajnew = xitmp;
6009 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
6010 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6011 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
6012 	ajfill = xitmp;
6013 	reallocate++; /* count how many reallocations are needed */
6014       }
6015       xitmp       = ajnew + ainew[prow];
6016       flev        = ajfill + ainew[prow];
6017       dloc[prow]  = nzi;
6018       fm          = fill[n];
6019       while (nzf--) {
6020 	*xitmp++ = fm;
6021 	*flev++ = im[fm];
6022 	fm      = fill[fm];
6023       }
6024       /* make sure row has diagonal entry */
6025       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6026 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6027     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6028       }
6029     }
6030     ierr = PetscFree(ajfill);CHKERRQ(ierr);
6031     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6032     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6033     ierr = PetscFree(fill);CHKERRQ(ierr);
6034     ierr = PetscFree(im);CHKERRQ(ierr);
6035 
6036 #if defined(PETSC_USE_INFO)
6037     {
6038       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6039       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6040       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6041       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6042       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6043       if (diagonal_fill) {
6044 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6045       }
6046     }
6047 #endif
6048 
6049     /* put together the new matrix */
6050     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6051     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6052     b    = (Mat_SeqBAIJ*)fact->data;
6053     b->free_a       = PETSC_TRUE;
6054     b->free_ij      = PETSC_TRUE;
6055     b->singlemalloc = PETSC_FALSE;
6056     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
6057     b->j          = ajnew;
6058     b->i          = ainew;
6059     for (i=0; i<n; i++) dloc[i] += ainew[i];
6060     b->diag       = dloc;
6061     b->free_diag  = PETSC_TRUE;
6062     b->ilen       = 0;
6063     b->imax       = 0;
6064     b->row        = isrow;
6065     b->col        = iscol;
6066     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6067     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6068     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6069     b->icol       = isicol;
6070     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
6071     /* In b structure:  Free imax, ilen, old a, old j.
6072        Allocate dloc, solve_work, new a, new j */
6073     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
6074     b->maxnz          = b->nz = ainew[n];
6075 
6076     fact->info.factor_mallocs    = reallocate;
6077     fact->info.fill_ratio_given  = f;
6078     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6079 
6080   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
6081   PetscFunctionReturn(0);
6082 }
6083 
6084 #undef __FUNCT__
6085 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6086 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6087 {
6088   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6089   /* int i,*AJ=a->j,nz=a->nz; */
6090   PetscFunctionBegin;
6091   /* Undo Column scaling */
6092 /*    while (nz--) { */
6093 /*      AJ[i] = AJ[i]/4; */
6094 /*    } */
6095   /* This should really invoke a push/pop logic, but we don't have that yet. */
6096   A->ops->setunfactored = PETSC_NULL;
6097   PetscFunctionReturn(0);
6098 }
6099 
6100 #undef __FUNCT__
6101 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6102 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6103 {
6104   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6105   PetscInt       *AJ=a->j,nz=a->nz;
6106   unsigned short *aj=(unsigned short *)AJ;
6107   PetscFunctionBegin;
6108   /* Is this really necessary? */
6109   while (nz--) {
6110     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6111   }
6112   A->ops->setunfactored = PETSC_NULL;
6113   PetscFunctionReturn(0);
6114 }
6115 
6116 
6117