xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 2ae9f97c2b3342c7fa240ee3f9868a1cbf4cbe21)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "../src/mat/impls/baij/seq/baij.h"
8 #include "../src/mat/blockinvert.h"
9 #include "petscbt.h"
10 #include "../src/mat/utils/freespace.h"
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17   PetscErrorCode    ierr;
18   PetscInt          i,nz;
19   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
20   const MatScalar   *aa=a->a,*v;
21   PetscScalar       s1,*x;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode    ierr;
64   PetscInt          i,nz,idx,idt,oidx;
65   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
66   const MatScalar   *aa=a->a,*v;
67   PetscScalar       s1,s2,x1,x2,*x;
68   const PetscScalar *b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode    ierr;
123   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt          nz,idx,idt,j,i,oidx;
125   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
126   const MatScalar   *aa=a->a,*v;
127   PetscScalar       s1,s2,x1,x2,*x;
128   const PetscScalar *b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode    ierr;
182   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
183   PetscInt          i,nz,idx,idt,oidx;
184   const MatScalar   *aa=a->a,*v;
185   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
186   const PetscScalar *b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode    ierr;
244   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245   PetscInt          nz,idx,idt,j,i,oidx;
246   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
247   const MatScalar   *aa=a->a,*v;
248   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
249   const PetscScalar *b;
250 
251   PetscFunctionBegin;
252   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255 
256   /* forward solve the U^T */
257   idx = 0;
258   for (i=0; i<n; i++) {
259     v     = aa + bs2*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265     v -= bs2;
266 
267     vi    = aj + diag[i] - 1;
268     nz    = diag[i] - diag[i+1] - 1;
269     for(j=0;j>-nz;j--){
270       oidx = bs*vi[j];
271       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274       v  -= bs2;
275     }
276     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277     idx += bs;
278   }
279   /* backward solve the L^T */
280   for (i=n-1; i>=0; i--){
281     v    = aa + bs2*ai[i];
282     vi   = aj + ai[i];
283     nz   = ai[i+1] - ai[i];
284     idt  = bs*i;
285     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286     for(j=0;j<nz;j++){
287       idx   = bs*vi[j];
288       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291       v += bs2;
292     }
293   }
294   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode    ierr;
306   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
307   PetscInt          i,nz,idx,idt,oidx;
308   const MatScalar   *aa=a->a,*v;
309   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
310   const PetscScalar *b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 16*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328     v += 16;
329 
330     vi    = aj + diag[i] + 1;
331     nz    = ai[i+1] - diag[i] - 1;
332     while (nz--) {
333       oidx = 4*(*vi++);
334       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338       v  += 16;
339     }
340     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341     idx += 4;
342   }
343   /* backward solve the L^T */
344   for (i=n-1; i>=0; i--){
345     v    = aa + 16*diag[i] - 16;
346     vi   = aj + diag[i] - 1;
347     nz   = diag[i] - ai[i];
348     idt  = 4*i;
349     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350     while (nz--) {
351       idx   = 4*(*vi--);
352       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356       v -= 16;
357     }
358   }
359   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
370   PetscErrorCode    ierr;
371   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372   PetscInt          nz,idx,idt,j,i,oidx;
373   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
374   const MatScalar   *aa=a->a,*v;
375   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
376   const PetscScalar *b;
377 
378   PetscFunctionBegin;
379   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382 
383   /* forward solve the U^T */
384   idx = 0;
385   for (i=0; i<n; i++) {
386     v     = aa + bs2*diag[i];
387     /* multiply by the inverse of the block diagonal */
388     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393     v -= bs2;
394 
395     vi    = aj + diag[i] - 1;
396     nz    = diag[i] - diag[i+1] - 1;
397     for(j=0;j>-nz;j--){
398       oidx = bs*vi[j];
399       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403       v  -= bs2;
404     }
405     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406     idx += bs;
407   }
408   /* backward solve the L^T */
409   for (i=n-1; i>=0; i--){
410     v    = aa + bs2*ai[i];
411     vi   = aj + ai[i];
412     nz   = ai[i+1] - ai[i];
413     idt  = bs*i;
414     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415     for(j=0;j<nz;j++){
416       idx   = bs*vi[j];
417       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421       v += bs2;
422     }
423   }
424   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
435   PetscErrorCode    ierr;
436   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
437   PetscInt          i,nz,idx,idt,oidx;
438   const MatScalar   *aa=a->a,*v;
439   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
440   const PetscScalar *b;
441 
442   PetscFunctionBegin;
443   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446 
447   /* forward solve the U^T */
448   idx = 0;
449   for (i=0; i<n; i++) {
450 
451     v     = aa + 25*diag[i];
452     /* multiply by the inverse of the block diagonal */
453     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459     v += 25;
460 
461     vi    = aj + diag[i] + 1;
462     nz    = ai[i+1] - diag[i] - 1;
463     while (nz--) {
464       oidx = 5*(*vi++);
465       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470       v  += 25;
471     }
472     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473     idx += 5;
474   }
475   /* backward solve the L^T */
476   for (i=n-1; i>=0; i--){
477     v    = aa + 25*diag[i] - 25;
478     vi   = aj + diag[i] - 1;
479     nz   = diag[i] - ai[i];
480     idt  = 5*i;
481     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482     while (nz--) {
483       idx   = 5*(*vi--);
484       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489       v -= 25;
490     }
491   }
492   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
501 {
502   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503   PetscErrorCode ierr;
504   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505   PetscInt       nz,idx,idt,j,i,oidx;
506   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507   const MatScalar      *aa=a->a,*v;
508   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
509   const PetscScalar    *b;
510 
511   PetscFunctionBegin;
512   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515 
516   /* forward solve the U^T */
517   idx = 0;
518   for (i=0; i<n; i++) {
519     v     = aa + bs2*diag[i];
520     /* multiply by the inverse of the block diagonal */
521     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522     x5 = x[4+idx];
523     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528     v -= bs2;
529 
530     vi    = aj + diag[i] - 1;
531     nz    = diag[i] - diag[i+1] - 1;
532     for(j=0;j>-nz;j--){
533       oidx = bs*vi[j];
534       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539       v  -= bs2;
540     }
541     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542     idx += bs;
543   }
544   /* backward solve the L^T */
545   for (i=n-1; i>=0; i--){
546     v    = aa + bs2*ai[i];
547     vi   = aj + ai[i];
548     nz   = ai[i+1] - ai[i];
549     idt  = bs*i;
550     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551     for(j=0;j<nz;j++){
552       idx   = bs*vi[j];
553       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558       v += bs2;
559     }
560   }
561   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
562   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570 {
571   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
572   PetscErrorCode    ierr;
573   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
574   PetscInt          i,nz,idx,idt,oidx;
575   const MatScalar   *aa=a->a,*v;
576   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
577   const PetscScalar *b;
578 
579   PetscFunctionBegin;
580   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583 
584   /* forward solve the U^T */
585   idx = 0;
586   for (i=0; i<n; i++) {
587 
588     v     = aa + 36*diag[i];
589     /* multiply by the inverse of the block diagonal */
590     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591     x6    = x[5+idx];
592     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598     v += 36;
599 
600     vi    = aj + diag[i] + 1;
601     nz    = ai[i+1] - diag[i] - 1;
602     while (nz--) {
603       oidx = 6*(*vi++);
604       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610       v  += 36;
611     }
612     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613     x[5+idx] = s6;
614     idx += 6;
615   }
616   /* backward solve the L^T */
617   for (i=n-1; i>=0; i--){
618     v    = aa + 36*diag[i] - 36;
619     vi   = aj + diag[i] - 1;
620     nz   = diag[i] - ai[i];
621     idt  = 6*i;
622     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623     s6 = x[5+idt];
624     while (nz--) {
625       idx   = 6*(*vi--);
626       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v -= 36;
633     }
634   }
635   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #undef __FUNCT__
642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
644 {
645   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
646   PetscErrorCode    ierr;
647   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648   PetscInt          nz,idx,idt,j,i,oidx;
649   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
650   const MatScalar   *aa=a->a,*v;
651   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
652   const PetscScalar *b;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658 
659   /* forward solve the U^T */
660   idx = 0;
661   for (i=0; i<n; i++) {
662     v     = aa + bs2*diag[i];
663     /* multiply by the inverse of the block diagonal */
664     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665     x5 = x[4+idx]; x6 = x[5+idx];
666     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672     v -= bs2;
673 
674     vi    = aj + diag[i] - 1;
675     nz    = diag[i] - diag[i+1] - 1;
676     for(j=0;j>-nz;j--){
677       oidx = bs*vi[j];
678       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v  -= bs2;
685     }
686     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687     x[5+idx] = s6;
688     idx += bs;
689   }
690   /* backward solve the L^T */
691   for (i=n-1; i>=0; i--){
692     v    = aa + bs2*ai[i];
693     vi   = aj + ai[i];
694     nz   = ai[i+1] - ai[i];
695     idt  = bs*i;
696     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697     s6   = x[5+idt];
698     for(j=0;j<nz;j++){
699       idx   = bs*vi[j];
700       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706       v += bs2;
707     }
708   }
709   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718 {
719   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
720   PetscErrorCode    ierr;
721   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
722   PetscInt          i,nz,idx,idt,oidx;
723   const MatScalar   *aa=a->a,*v;
724   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
725   const PetscScalar *b;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 49*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739     x6    = x[5+idx]; x7 = x[6+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747     v += 49;
748 
749     vi    = aj + diag[i] + 1;
750     nz    = ai[i+1] - diag[i] - 1;
751     while (nz--) {
752       oidx = 7*(*vi++);
753       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760       v  += 49;
761     }
762     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763     x[5+idx] = s6;x[6+idx] = s7;
764     idx += 7;
765   }
766   /* backward solve the L^T */
767   for (i=n-1; i>=0; i--){
768     v    = aa + 49*diag[i] - 49;
769     vi   = aj + diag[i] - 1;
770     nz   = diag[i] - ai[i];
771     idt  = 7*i;
772     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773     s6 = x[5+idt];s7 = x[6+idt];
774     while (nz--) {
775       idx   = 7*(*vi--);
776       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783       v -= 49;
784     }
785   }
786   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
787   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
796   PetscErrorCode    ierr;
797   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798   PetscInt          nz,idx,idt,j,i,oidx;
799   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
800   const MatScalar   *aa=a->a,*v;
801   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
802   const PetscScalar *b;
803 
804   PetscFunctionBegin;
805   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808 
809   /* forward solve the U^T */
810   idx = 0;
811   for (i=0; i<n; i++) {
812     v     = aa + bs2*diag[i];
813     /* multiply by the inverse of the block diagonal */
814     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823     v -= bs2;
824     vi    = aj + diag[i] - 1;
825     nz    = diag[i] - diag[i+1] - 1;
826     for(j=0;j>-nz;j--){
827       oidx = bs*vi[j];
828       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v  -= bs2;
836     }
837     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838     x[5+idx] = s6;  x[6+idx] = s7;
839     idx += bs;
840   }
841   /* backward solve the L^T */
842   for (i=n-1; i>=0; i--){
843     v    = aa + bs2*ai[i];
844     vi   = aj + ai[i];
845     nz   = ai[i+1] - ai[i];
846     idt  = bs*i;
847     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848     s6   = x[5+idt];  s7 = x[6+idt];
849     for(j=0;j<nz;j++){
850       idx   = bs*vi[j];
851       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858       v += bs2;
859     }
860   }
861   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*---------------------------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
873   IS                iscol=a->col,isrow=a->row;
874   PetscErrorCode    ierr;
875   const PetscInt    *r,*c,*rout,*cout;
876   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
877   PetscInt          i,nz;
878   const MatScalar   *aa=a->a,*v;
879   PetscScalar       s1,*x,*t;
880   const PetscScalar *b;
881 
882   PetscFunctionBegin;
883   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
884   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
885   t  = a->solve_work;
886 
887   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
888   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
889 
890   /* copy the b into temp work space according to permutation */
891   for (i=0; i<n; i++) {
892     t[i] = b[c[i]];
893   }
894 
895   /* forward solve the U^T */
896   for (i=0; i<n; i++) {
897 
898     v     = aa + diag[i];
899     /* multiply by the inverse of the block diagonal */
900     s1    = (*v++)*t[i];
901     vi    = aj + diag[i] + 1;
902     nz    = ai[i+1] - diag[i] - 1;
903     while (nz--) {
904       t[*vi++]  -= (*v++)*s1;
905     }
906     t[i]   = s1;
907   }
908   /* backward solve the L^T */
909   for (i=n-1; i>=0; i--){
910     v    = aa + diag[i] - 1;
911     vi   = aj + diag[i] - 1;
912     nz   = diag[i] - ai[i];
913     s1   = t[i];
914     while (nz--) {
915       t[*vi--]   -=  (*v--)*s1;
916     }
917   }
918 
919   /* copy t into x according to permutation */
920   for (i=0; i<n; i++) {
921     x[r[i]]   = t[i];
922   }
923 
924   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
925   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
926   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
927   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
928   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
929   PetscFunctionReturn(0);
930 }
931 
932 #undef __FUNCT__
933 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
934 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
935 {
936   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
937   IS                iscol=a->col,isrow=a->row;
938   PetscErrorCode    ierr;
939   const PetscInt    *r,*c,*rout,*cout;
940   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
941   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
942   const MatScalar   *aa=a->a,*v;
943   PetscScalar       s1,s2,x1,x2,*x,*t;
944   const PetscScalar *b;
945 
946   PetscFunctionBegin;
947   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
948   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
949   t  = a->solve_work;
950 
951   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
952   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
953 
954   /* copy the b into temp work space according to permutation */
955   ii = 0;
956   for (i=0; i<n; i++) {
957     ic      = 2*c[i];
958     t[ii]   = b[ic];
959     t[ii+1] = b[ic+1];
960     ii += 2;
961   }
962 
963   /* forward solve the U^T */
964   idx = 0;
965   for (i=0; i<n; i++) {
966 
967     v     = aa + 4*diag[i];
968     /* multiply by the inverse of the block diagonal */
969     x1    = t[idx];   x2 = t[1+idx];
970     s1 = v[0]*x1  +  v[1]*x2;
971     s2 = v[2]*x1  +  v[3]*x2;
972     v += 4;
973 
974     vi    = aj + diag[i] + 1;
975     nz    = ai[i+1] - diag[i] - 1;
976     while (nz--) {
977       oidx = 2*(*vi++);
978       t[oidx]   -= v[0]*s1  +  v[1]*s2;
979       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
980       v  += 4;
981     }
982     t[idx]   = s1;t[1+idx] = s2;
983     idx += 2;
984   }
985   /* backward solve the L^T */
986   for (i=n-1; i>=0; i--){
987     v    = aa + 4*diag[i] - 4;
988     vi   = aj + diag[i] - 1;
989     nz   = diag[i] - ai[i];
990     idt  = 2*i;
991     s1 = t[idt];  s2 = t[1+idt];
992     while (nz--) {
993       idx   = 2*(*vi--);
994       t[idx]   -=  v[0]*s1 +  v[1]*s2;
995       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
996       v -= 4;
997     }
998   }
999 
1000   /* copy t into x according to permutation */
1001   ii = 0;
1002   for (i=0; i<n; i++) {
1003     ir      = 2*r[i];
1004     x[ir]   = t[ii];
1005     x[ir+1] = t[ii+1];
1006     ii += 2;
1007   }
1008 
1009   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1010   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1011   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1012   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1013   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNCT__
1018 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1019 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1020 {
1021   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1022   PetscErrorCode    ierr;
1023   IS                iscol=a->col,isrow=a->row;
1024   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1025   const PetscInt    *r,*c,*rout,*cout;
1026   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1027   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1028   const MatScalar   *aa=a->a,*v;
1029   PetscScalar       s1,s2,x1,x2,*x,*t;
1030   const PetscScalar *b;
1031 
1032   PetscFunctionBegin;
1033   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1034   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1035   t = a->solve_work;
1036 
1037   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1038   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1039 
1040   /* copy b into temp work space according to permutation */
1041   for(i=0;i<n;i++){
1042     ii = bs*i; ic = bs*c[i];
1043     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1044   }
1045 
1046   /* forward solve the U^T */
1047   idx = 0;
1048   for (i=0; i<n; i++) {
1049     v     = aa + bs2*diag[i];
1050     /* multiply by the inverse of the block diagonal */
1051     x1 = t[idx];   x2 = t[1+idx];
1052     s1 = v[0]*x1  +  v[1]*x2;
1053     s2 = v[2]*x1  +  v[3]*x2;
1054     v -= bs2;
1055 
1056     vi    = aj + diag[i] - 1;
1057     nz    = diag[i] - diag[i+1] - 1;
1058     for(j=0;j>-nz;j--){
1059       oidx = bs*vi[j];
1060       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1061       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1062       v  -= bs2;
1063     }
1064     t[idx]   = s1;t[1+idx] = s2;
1065     idx += bs;
1066   }
1067   /* backward solve the L^T */
1068   for (i=n-1; i>=0; i--){
1069     v    = aa + bs2*ai[i];
1070     vi   = aj + ai[i];
1071     nz   = ai[i+1] - ai[i];
1072     idt  = bs*i;
1073     s1   = t[idt];  s2 = t[1+idt];
1074     for(j=0;j<nz;j++){
1075       idx   = bs*vi[j];
1076       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1077       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1078       v += bs2;
1079     }
1080   }
1081 
1082   /* copy t into x according to permutation */
1083   for(i=0;i<n;i++){
1084     ii = bs*i;  ir = bs*r[i];
1085     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1086   }
1087 
1088   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1089   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1091   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1092   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 #undef __FUNCT__
1097 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1098 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1099 {
1100   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1101   IS                iscol=a->col,isrow=a->row;
1102   PetscErrorCode    ierr;
1103   const PetscInt    *r,*c,*rout,*cout;
1104   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1105   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1106   const MatScalar   *aa=a->a,*v;
1107   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1108   const PetscScalar *b;
1109 
1110   PetscFunctionBegin;
1111   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1112   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1113   t  = a->solve_work;
1114 
1115   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1116   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1117 
1118   /* copy the b into temp work space according to permutation */
1119   ii = 0;
1120   for (i=0; i<n; i++) {
1121     ic      = 3*c[i];
1122     t[ii]   = b[ic];
1123     t[ii+1] = b[ic+1];
1124     t[ii+2] = b[ic+2];
1125     ii += 3;
1126   }
1127 
1128   /* forward solve the U^T */
1129   idx = 0;
1130   for (i=0; i<n; i++) {
1131 
1132     v     = aa + 9*diag[i];
1133     /* multiply by the inverse of the block diagonal */
1134     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1135     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1136     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1137     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1138     v += 9;
1139 
1140     vi    = aj + diag[i] + 1;
1141     nz    = ai[i+1] - diag[i] - 1;
1142     while (nz--) {
1143       oidx = 3*(*vi++);
1144       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1145       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1146       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1147       v  += 9;
1148     }
1149     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1150     idx += 3;
1151   }
1152   /* backward solve the L^T */
1153   for (i=n-1; i>=0; i--){
1154     v    = aa + 9*diag[i] - 9;
1155     vi   = aj + diag[i] - 1;
1156     nz   = diag[i] - ai[i];
1157     idt  = 3*i;
1158     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1159     while (nz--) {
1160       idx   = 3*(*vi--);
1161       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1162       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1163       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1164       v -= 9;
1165     }
1166   }
1167 
1168   /* copy t into x according to permutation */
1169   ii = 0;
1170   for (i=0; i<n; i++) {
1171     ir      = 3*r[i];
1172     x[ir]   = t[ii];
1173     x[ir+1] = t[ii+1];
1174     x[ir+2] = t[ii+2];
1175     ii += 3;
1176   }
1177 
1178   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1179   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1180   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1181   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1182   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1183   PetscFunctionReturn(0);
1184 }
1185 
1186 #undef __FUNCT__
1187 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1188 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1189 {
1190   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1191   PetscErrorCode    ierr;
1192   IS                iscol=a->col,isrow=a->row;
1193   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1194   const PetscInt    *r,*c,*rout,*cout;
1195   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1196   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1197   const MatScalar   *aa=a->a,*v;
1198   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1199   const PetscScalar *b;
1200 
1201   PetscFunctionBegin;
1202   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204   t = a->solve_work;
1205 
1206   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1207   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1208 
1209   /* copy b into temp work space according to permutation */
1210   for(i=0;i<n;i++){
1211     ii = bs*i; ic = bs*c[i];
1212     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1213   }
1214 
1215   /* forward solve the U^T */
1216   idx = 0;
1217   for (i=0; i<n; i++) {
1218     v     = aa + bs2*diag[i];
1219     /* multiply by the inverse of the block diagonal */
1220     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1221     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1222     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1223     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1224     v -= bs2;
1225 
1226     vi    = aj + diag[i] - 1;
1227     nz    = diag[i] - diag[i+1] - 1;
1228     for(j=0;j>-nz;j--){
1229       oidx = bs*vi[j];
1230       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1231       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1232       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233       v  -= bs2;
1234     }
1235     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1236     idx += bs;
1237   }
1238   /* backward solve the L^T */
1239   for (i=n-1; i>=0; i--){
1240     v    = aa + bs2*ai[i];
1241     vi   = aj + ai[i];
1242     nz   = ai[i+1] - ai[i];
1243     idt  = bs*i;
1244     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1245     for(j=0;j<nz;j++){
1246       idx   = bs*vi[j];
1247       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1248       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1249       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1250       v += bs2;
1251     }
1252   }
1253 
1254   /* copy t into x according to permutation */
1255   for(i=0;i<n;i++){
1256     ii = bs*i;  ir = bs*r[i];
1257     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1258   }
1259 
1260   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1261   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1263   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1264   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1270 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1271 {
1272   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1273   IS                iscol=a->col,isrow=a->row;
1274   PetscErrorCode    ierr;
1275   const PetscInt    *r,*c,*rout,*cout;
1276   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1277   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1278   const MatScalar   *aa=a->a,*v;
1279   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1280   const PetscScalar *b;
1281 
1282   PetscFunctionBegin;
1283   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1284   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1285   t  = a->solve_work;
1286 
1287   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1288   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1289 
1290   /* copy the b into temp work space according to permutation */
1291   ii = 0;
1292   for (i=0; i<n; i++) {
1293     ic      = 4*c[i];
1294     t[ii]   = b[ic];
1295     t[ii+1] = b[ic+1];
1296     t[ii+2] = b[ic+2];
1297     t[ii+3] = b[ic+3];
1298     ii += 4;
1299   }
1300 
1301   /* forward solve the U^T */
1302   idx = 0;
1303   for (i=0; i<n; i++) {
1304 
1305     v     = aa + 16*diag[i];
1306     /* multiply by the inverse of the block diagonal */
1307     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1308     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1309     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1310     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1311     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1312     v += 16;
1313 
1314     vi    = aj + diag[i] + 1;
1315     nz    = ai[i+1] - diag[i] - 1;
1316     while (nz--) {
1317       oidx = 4*(*vi++);
1318       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1319       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1320       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1321       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1322       v  += 16;
1323     }
1324     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1325     idx += 4;
1326   }
1327   /* backward solve the L^T */
1328   for (i=n-1; i>=0; i--){
1329     v    = aa + 16*diag[i] - 16;
1330     vi   = aj + diag[i] - 1;
1331     nz   = diag[i] - ai[i];
1332     idt  = 4*i;
1333     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1334     while (nz--) {
1335       idx   = 4*(*vi--);
1336       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1337       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1338       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1339       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1340       v -= 16;
1341     }
1342   }
1343 
1344   /* copy t into x according to permutation */
1345   ii = 0;
1346   for (i=0; i<n; i++) {
1347     ir      = 4*r[i];
1348     x[ir]   = t[ii];
1349     x[ir+1] = t[ii+1];
1350     x[ir+2] = t[ii+2];
1351     x[ir+3] = t[ii+3];
1352     ii += 4;
1353   }
1354 
1355   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1356   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1358   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1359   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 #undef __FUNCT__
1364 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1365 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1366 {
1367   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1368   PetscErrorCode    ierr;
1369   IS                iscol=a->col,isrow=a->row;
1370   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1371   const PetscInt    *r,*c,*rout,*cout;
1372   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1373   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1374   const MatScalar   *aa=a->a,*v;
1375   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1376   const PetscScalar *b;
1377 
1378   PetscFunctionBegin;
1379   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1380   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1381   t = a->solve_work;
1382 
1383   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1384   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1385 
1386   /* copy b into temp work space according to permutation */
1387   for(i=0;i<n;i++){
1388     ii = bs*i; ic = bs*c[i];
1389     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1390   }
1391 
1392   /* forward solve the U^T */
1393   idx = 0;
1394   for (i=0; i<n; i++) {
1395     v     = aa + bs2*diag[i];
1396     /* multiply by the inverse of the block diagonal */
1397     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1398     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1399     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1400     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1401     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1402     v -= bs2;
1403 
1404     vi    = aj + diag[i] - 1;
1405     nz    = diag[i] - diag[i+1] - 1;
1406     for(j=0;j>-nz;j--){
1407       oidx = bs*vi[j];
1408       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1409       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1410       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1411       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1412       v  -= bs2;
1413     }
1414     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1415     idx += bs;
1416   }
1417   /* backward solve the L^T */
1418   for (i=n-1; i>=0; i--){
1419     v    = aa + bs2*ai[i];
1420     vi   = aj + ai[i];
1421     nz   = ai[i+1] - ai[i];
1422     idt  = bs*i;
1423     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1424     for(j=0;j<nz;j++){
1425       idx   = bs*vi[j];
1426       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1427       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1428       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1429       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1430       v += bs2;
1431     }
1432   }
1433 
1434   /* copy t into x according to permutation */
1435   for(i=0;i<n;i++){
1436     ii = bs*i;  ir = bs*r[i];
1437     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1438   }
1439 
1440   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1441   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1444   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 #undef __FUNCT__
1449 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1450 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1451 {
1452   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1453   IS                iscol=a->col,isrow=a->row;
1454   PetscErrorCode    ierr;
1455   const PetscInt    *r,*c,*rout,*cout;
1456   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1457   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1458   const MatScalar   *aa=a->a,*v;
1459   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1460   const PetscScalar *b;
1461 
1462   PetscFunctionBegin;
1463   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1464   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1465   t  = a->solve_work;
1466 
1467   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1468   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1469 
1470   /* copy the b into temp work space according to permutation */
1471   ii = 0;
1472   for (i=0; i<n; i++) {
1473     ic      = 5*c[i];
1474     t[ii]   = b[ic];
1475     t[ii+1] = b[ic+1];
1476     t[ii+2] = b[ic+2];
1477     t[ii+3] = b[ic+3];
1478     t[ii+4] = b[ic+4];
1479     ii += 5;
1480   }
1481 
1482   /* forward solve the U^T */
1483   idx = 0;
1484   for (i=0; i<n; i++) {
1485 
1486     v     = aa + 25*diag[i];
1487     /* multiply by the inverse of the block diagonal */
1488     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1489     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1490     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1491     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1492     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1493     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1494     v += 25;
1495 
1496     vi    = aj + diag[i] + 1;
1497     nz    = ai[i+1] - diag[i] - 1;
1498     while (nz--) {
1499       oidx = 5*(*vi++);
1500       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1501       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1502       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1503       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1504       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1505       v  += 25;
1506     }
1507     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1508     idx += 5;
1509   }
1510   /* backward solve the L^T */
1511   for (i=n-1; i>=0; i--){
1512     v    = aa + 25*diag[i] - 25;
1513     vi   = aj + diag[i] - 1;
1514     nz   = diag[i] - ai[i];
1515     idt  = 5*i;
1516     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1517     while (nz--) {
1518       idx   = 5*(*vi--);
1519       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1520       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1521       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1522       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1523       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1524       v -= 25;
1525     }
1526   }
1527 
1528   /* copy t into x according to permutation */
1529   ii = 0;
1530   for (i=0; i<n; i++) {
1531     ir      = 5*r[i];
1532     x[ir]   = t[ii];
1533     x[ir+1] = t[ii+1];
1534     x[ir+2] = t[ii+2];
1535     x[ir+3] = t[ii+3];
1536     x[ir+4] = t[ii+4];
1537     ii += 5;
1538   }
1539 
1540   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1541   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1542   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1543   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1544   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 #undef __FUNCT__
1549 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1550 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1551 {
1552   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1553   PetscErrorCode    ierr;
1554   IS                iscol=a->col,isrow=a->row;
1555   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1556   const PetscInt    *r,*c,*rout,*cout;
1557   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1558   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1559   const MatScalar   *aa=a->a,*v;
1560   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1561   const PetscScalar *b;
1562 
1563   PetscFunctionBegin;
1564   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1565   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1566   t = a->solve_work;
1567 
1568   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1569   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1570 
1571   /* copy b into temp work space according to permutation */
1572   for(i=0;i<n;i++){
1573     ii = bs*i; ic = bs*c[i];
1574     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1575     t[ii+4] = b[ic+4];
1576   }
1577 
1578   /* forward solve the U^T */
1579   idx = 0;
1580   for (i=0; i<n; i++) {
1581     v     = aa + bs2*diag[i];
1582     /* multiply by the inverse of the block diagonal */
1583     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1584     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1585     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1586     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1587     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1588     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1589     v -= bs2;
1590 
1591     vi    = aj + diag[i] - 1;
1592     nz    = diag[i] - diag[i+1] - 1;
1593     for(j=0;j>-nz;j--){
1594       oidx = bs*vi[j];
1595       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1596       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1597       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1598       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1599       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1600       v  -= bs2;
1601     }
1602     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1603     idx += bs;
1604   }
1605   /* backward solve the L^T */
1606   for (i=n-1; i>=0; i--){
1607     v    = aa + bs2*ai[i];
1608     vi   = aj + ai[i];
1609     nz   = ai[i+1] - ai[i];
1610     idt  = bs*i;
1611     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1612     for(j=0;j<nz;j++){
1613       idx   = bs*vi[j];
1614       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1615       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1616       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1617       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1618       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1619       v += bs2;
1620     }
1621   }
1622 
1623   /* copy t into x according to permutation */
1624   for(i=0;i<n;i++){
1625     ii = bs*i;  ir = bs*r[i];
1626     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1627     x[ir+4] = t[ii+4];
1628   }
1629 
1630   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1631   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1633   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1634   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 #undef __FUNCT__
1639 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1640 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1641 {
1642   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1643   IS                iscol=a->col,isrow=a->row;
1644   PetscErrorCode    ierr;
1645   const PetscInt    *r,*c,*rout,*cout;
1646   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1647   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1648   const MatScalar   *aa=a->a,*v;
1649   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1650   const PetscScalar *b;
1651 
1652   PetscFunctionBegin;
1653   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1654   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1655   t  = a->solve_work;
1656 
1657   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1658   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1659 
1660   /* copy the b into temp work space according to permutation */
1661   ii = 0;
1662   for (i=0; i<n; i++) {
1663     ic      = 6*c[i];
1664     t[ii]   = b[ic];
1665     t[ii+1] = b[ic+1];
1666     t[ii+2] = b[ic+2];
1667     t[ii+3] = b[ic+3];
1668     t[ii+4] = b[ic+4];
1669     t[ii+5] = b[ic+5];
1670     ii += 6;
1671   }
1672 
1673   /* forward solve the U^T */
1674   idx = 0;
1675   for (i=0; i<n; i++) {
1676 
1677     v     = aa + 36*diag[i];
1678     /* multiply by the inverse of the block diagonal */
1679     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1680     x6    = t[5+idx];
1681     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1682     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1683     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1684     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1685     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1686     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1687     v += 36;
1688 
1689     vi    = aj + diag[i] + 1;
1690     nz    = ai[i+1] - diag[i] - 1;
1691     while (nz--) {
1692       oidx = 6*(*vi++);
1693       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1694       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1695       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1696       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1697       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1698       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1699       v  += 36;
1700     }
1701     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1702     t[5+idx] = s6;
1703     idx += 6;
1704   }
1705   /* backward solve the L^T */
1706   for (i=n-1; i>=0; i--){
1707     v    = aa + 36*diag[i] - 36;
1708     vi   = aj + diag[i] - 1;
1709     nz   = diag[i] - ai[i];
1710     idt  = 6*i;
1711     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1712     s6 = t[5+idt];
1713     while (nz--) {
1714       idx   = 6*(*vi--);
1715       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1716       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1717       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1718       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1719       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1720       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1721       v -= 36;
1722     }
1723   }
1724 
1725   /* copy t into x according to permutation */
1726   ii = 0;
1727   for (i=0; i<n; i++) {
1728     ir      = 6*r[i];
1729     x[ir]   = t[ii];
1730     x[ir+1] = t[ii+1];
1731     x[ir+2] = t[ii+2];
1732     x[ir+3] = t[ii+3];
1733     x[ir+4] = t[ii+4];
1734     x[ir+5] = t[ii+5];
1735     ii += 6;
1736   }
1737 
1738   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1739   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1742   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1743   PetscFunctionReturn(0);
1744 }
1745 
1746 #undef __FUNCT__
1747 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1748 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1749 {
1750   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1751   PetscErrorCode    ierr;
1752   IS                iscol=a->col,isrow=a->row;
1753   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1754   const PetscInt    *r,*c,*rout,*cout;
1755   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1756   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1757   const MatScalar   *aa=a->a,*v;
1758   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759   const PetscScalar *b;
1760 
1761   PetscFunctionBegin;
1762   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1764   t = a->solve_work;
1765 
1766   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1767   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1768 
1769   /* copy b into temp work space according to permutation */
1770   for(i=0;i<n;i++){
1771     ii = bs*i; ic = bs*c[i];
1772     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1773     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1774   }
1775 
1776   /* forward solve the U^T */
1777   idx = 0;
1778   for (i=0; i<n; i++) {
1779     v     = aa + bs2*diag[i];
1780     /* multiply by the inverse of the block diagonal */
1781     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1782     x6    = t[5+idx];
1783     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1784     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1785     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1786     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1787     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1788     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1789     v -= bs2;
1790 
1791     vi    = aj + diag[i] - 1;
1792     nz    = diag[i] - diag[i+1] - 1;
1793     for(j=0;j>-nz;j--){
1794       oidx = bs*vi[j];
1795       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1796       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1797       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1798       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1799       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1800       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1801       v  -= bs2;
1802     }
1803     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1804     t[5+idx] = s6;
1805     idx += bs;
1806   }
1807   /* backward solve the L^T */
1808   for (i=n-1; i>=0; i--){
1809     v    = aa + bs2*ai[i];
1810     vi   = aj + ai[i];
1811     nz   = ai[i+1] - ai[i];
1812     idt  = bs*i;
1813     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1814     s6   = t[5+idt];
1815    for(j=0;j<nz;j++){
1816       idx   = bs*vi[j];
1817       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1818       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1819       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1820       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1821       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1822       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1823       v += bs2;
1824     }
1825   }
1826 
1827   /* copy t into x according to permutation */
1828   for(i=0;i<n;i++){
1829     ii = bs*i;  ir = bs*r[i];
1830     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1831     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1832   }
1833 
1834   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1835   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1836   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1837   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1838   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1839   PetscFunctionReturn(0);
1840 }
1841 
1842 #undef __FUNCT__
1843 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1844 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1845 {
1846   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1847   IS                iscol=a->col,isrow=a->row;
1848   PetscErrorCode    ierr;
1849   const PetscInt    *r,*c,*rout,*cout;
1850   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1851   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1852   const MatScalar   *aa=a->a,*v;
1853   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1854   const PetscScalar *b;
1855 
1856   PetscFunctionBegin;
1857   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1858   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1859   t  = a->solve_work;
1860 
1861   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1862   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1863 
1864   /* copy the b into temp work space according to permutation */
1865   ii = 0;
1866   for (i=0; i<n; i++) {
1867     ic      = 7*c[i];
1868     t[ii]   = b[ic];
1869     t[ii+1] = b[ic+1];
1870     t[ii+2] = b[ic+2];
1871     t[ii+3] = b[ic+3];
1872     t[ii+4] = b[ic+4];
1873     t[ii+5] = b[ic+5];
1874     t[ii+6] = b[ic+6];
1875     ii += 7;
1876   }
1877 
1878   /* forward solve the U^T */
1879   idx = 0;
1880   for (i=0; i<n; i++) {
1881 
1882     v     = aa + 49*diag[i];
1883     /* multiply by the inverse of the block diagonal */
1884     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1885     x6    = t[5+idx]; x7 = t[6+idx];
1886     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1887     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1888     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1889     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1890     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1891     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1892     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1893     v += 49;
1894 
1895     vi    = aj + diag[i] + 1;
1896     nz    = ai[i+1] - diag[i] - 1;
1897     while (nz--) {
1898       oidx = 7*(*vi++);
1899       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1900       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1901       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1902       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1903       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1904       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1905       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1906       v  += 49;
1907     }
1908     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1909     t[5+idx] = s6;t[6+idx] = s7;
1910     idx += 7;
1911   }
1912   /* backward solve the L^T */
1913   for (i=n-1; i>=0; i--){
1914     v    = aa + 49*diag[i] - 49;
1915     vi   = aj + diag[i] - 1;
1916     nz   = diag[i] - ai[i];
1917     idt  = 7*i;
1918     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1919     s6 = t[5+idt];s7 = t[6+idt];
1920     while (nz--) {
1921       idx   = 7*(*vi--);
1922       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1923       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1924       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1925       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1926       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1927       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1928       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1929       v -= 49;
1930     }
1931   }
1932 
1933   /* copy t into x according to permutation */
1934   ii = 0;
1935   for (i=0; i<n; i++) {
1936     ir      = 7*r[i];
1937     x[ir]   = t[ii];
1938     x[ir+1] = t[ii+1];
1939     x[ir+2] = t[ii+2];
1940     x[ir+3] = t[ii+3];
1941     x[ir+4] = t[ii+4];
1942     x[ir+5] = t[ii+5];
1943     x[ir+6] = t[ii+6];
1944     ii += 7;
1945   }
1946 
1947   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1948   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1950   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 #undef __FUNCT__
1955 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1956 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1957 {
1958   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1959   PetscErrorCode    ierr;
1960   IS                iscol=a->col,isrow=a->row;
1961   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1962   const PetscInt    *r,*c,*rout,*cout;
1963   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1964   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1965   const MatScalar   *aa=a->a,*v;
1966   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1967   const PetscScalar *b;
1968 
1969   PetscFunctionBegin;
1970   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1971   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1972   t = a->solve_work;
1973 
1974   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1975   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1976 
1977   /* copy b into temp work space according to permutation */
1978   for(i=0;i<n;i++){
1979     ii = bs*i; ic = bs*c[i];
1980     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1981     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1982   }
1983 
1984   /* forward solve the U^T */
1985   idx = 0;
1986   for (i=0; i<n; i++) {
1987     v     = aa + bs2*diag[i];
1988     /* multiply by the inverse of the block diagonal */
1989     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1990     x6    = t[5+idx]; x7 = t[6+idx];
1991     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1992     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1993     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1994     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1995     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1996     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1997     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1998     v -= bs2;
1999 
2000     vi    = aj + diag[i] - 1;
2001     nz    = diag[i] - diag[i+1] - 1;
2002     for(j=0;j>-nz;j--){
2003       oidx = bs*vi[j];
2004       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2005       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2006       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2007       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2008       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2009       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2010       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2011       v  -= bs2;
2012     }
2013     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2014     t[5+idx] = s6;  t[6+idx] = s7;
2015     idx += bs;
2016   }
2017   /* backward solve the L^T */
2018   for (i=n-1; i>=0; i--){
2019     v    = aa + bs2*ai[i];
2020     vi   = aj + ai[i];
2021     nz   = ai[i+1] - ai[i];
2022     idt  = bs*i;
2023     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2024     s6   = t[5+idt];  s7 = t[6+idt];
2025    for(j=0;j<nz;j++){
2026       idx   = bs*vi[j];
2027       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2028       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2029       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2030       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2031       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2032       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2033       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2034       v += bs2;
2035     }
2036   }
2037 
2038   /* copy t into x according to permutation */
2039   for(i=0;i<n;i++){
2040     ii = bs*i;  ir = bs*r[i];
2041     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2042     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2043   }
2044 
2045   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2046   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2047   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2048   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2049   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2050   PetscFunctionReturn(0);
2051 }
2052 
2053 /* ----------------------------------------------------------- */
2054 #undef __FUNCT__
2055 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2056 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2057 {
2058   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2059   IS                iscol=a->col,isrow=a->row;
2060   PetscErrorCode    ierr;
2061   const PetscInt    *r,*c,*rout,*cout;
2062   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2063   PetscInt          i,nz;
2064   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2065   const MatScalar   *aa=a->a,*v;
2066   PetscScalar       *x,*s,*t,*ls;
2067   const PetscScalar *b;
2068 
2069   PetscFunctionBegin;
2070   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2071   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2072   t  = a->solve_work;
2073 
2074   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2075   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2076 
2077   /* forward solve the lower triangular */
2078   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2079   for (i=1; i<n; i++) {
2080     v   = aa + bs2*ai[i];
2081     vi  = aj + ai[i];
2082     nz  = a->diag[i] - ai[i];
2083     s = t + bs*i;
2084     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2085     while (nz--) {
2086       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2087       v += bs2;
2088     }
2089   }
2090   /* backward solve the upper triangular */
2091   ls = a->solve_work + A->cmap->n;
2092   for (i=n-1; i>=0; i--){
2093     v   = aa + bs2*(a->diag[i] + 1);
2094     vi  = aj + a->diag[i] + 1;
2095     nz  = ai[i+1] - a->diag[i] - 1;
2096     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2097     while (nz--) {
2098       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2099       v += bs2;
2100     }
2101     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2102     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2103   }
2104 
2105   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2106   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2107   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2109   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2110   PetscFunctionReturn(0);
2111 }
2112 
2113 /* ----------------------------------------------------------- */
2114 #undef __FUNCT__
2115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2116 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2117 {
2118   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2119   IS                iscol=a->col,isrow=a->row;
2120   PetscErrorCode    ierr;
2121   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2122   PetscInt          i,nz,j;
2123   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2124   const MatScalar   *aa=a->a,*v;
2125   PetscScalar       *x,*t,*ls;
2126   const PetscScalar *b;
2127   PetscFunctionBegin;
2128   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2130   t    = a->solve_work;
2131 
2132   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2133   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2134 
2135   /* copy the b into temp work space according to permutation */
2136   for (i=0; i<n; i++) {
2137     for (j=0; j<bs; j++) {
2138       t[i*bs+j] = b[c[i]*bs+j];
2139     }
2140   }
2141 
2142 
2143   /* forward solve the upper triangular transpose */
2144   ls = a->solve_work + A->cmap->n;
2145   for (i=0; i<n; i++){
2146     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2147     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2148     v   = aa + bs2*(a->diag[i] + 1);
2149     vi  = aj + a->diag[i] + 1;
2150     nz  = ai[i+1] - a->diag[i] - 1;
2151     while (nz--) {
2152       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2153       v += bs2;
2154     }
2155   }
2156 
2157   /* backward solve the lower triangular transpose */
2158   for (i=n-1; i>=0; i--) {
2159     v   = aa + bs2*ai[i];
2160     vi  = aj + ai[i];
2161     nz  = a->diag[i] - ai[i];
2162     while (nz--) {
2163       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2164       v += bs2;
2165     }
2166   }
2167 
2168   /* copy t into x according to permutation */
2169   for (i=0; i<n; i++) {
2170     for (j=0; j<bs; j++) {
2171       x[bs*r[i]+j]   = t[bs*i+j];
2172     }
2173   }
2174 
2175   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2176   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2177   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2178   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2179   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2180   PetscFunctionReturn(0);
2181 }
2182 
2183 #undef __FUNCT__
2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2186 {
2187   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2188   IS                iscol=a->col,isrow=a->row;
2189   PetscErrorCode    ierr;
2190   const PetscInt    *r,*c,*rout,*cout;
2191   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2192   PetscInt          i,j,nz;
2193   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2194   const MatScalar   *aa=a->a,*v;
2195   PetscScalar       *x,*t,*ls;
2196   const PetscScalar *b;
2197 
2198   PetscFunctionBegin;
2199   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2200   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2201   t    = a->solve_work;
2202 
2203   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2204   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2205 
2206   /* copy the b into temp work space according to permutation */
2207   for (i=0; i<n; i++) {
2208     for (j=0; j<bs; j++) {
2209       t[i*bs+j] = b[c[i]*bs+j];
2210     }
2211   }
2212 
2213 
2214   /* forward solve the upper triangular transpose */
2215   ls = a->solve_work + A->cmap->n;
2216   for (i=0; i<n; i++){
2217     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2218     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2219     v   = aa + bs2*(diag[i] - 1);
2220     vi  = aj + diag[i] - 1;
2221     nz  = diag[i] - diag[i+1] - 1;
2222     for(j=0;j>-nz;j--){
2223       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2224       v -= bs2;
2225     }
2226   }
2227 
2228   /* backward solve the lower triangular transpose */
2229   for (i=n-1; i>=0; i--) {
2230     v   = aa + bs2*ai[i];
2231     vi  = aj + ai[i];
2232     nz  = ai[i+1] - ai[i];
2233     for(j=0;j<nz;j++){
2234       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2235       v += bs2;
2236     }
2237   }
2238 
2239   /* copy t into x according to permutation */
2240   for (i=0; i<n; i++) {
2241     for (j=0; j<bs; j++) {
2242       x[bs*r[i]+j]   = t[bs*i+j];
2243     }
2244   }
2245 
2246   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2247   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2248   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2249   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2250   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2251   PetscFunctionReturn(0);
2252 }
2253 
2254 /* bs = 15 for PFLOTRAN */
2255 
2256 #undef __FUNCT__
2257 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering"
2258 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering(Mat A,Vec bb,Vec xx)
2259 {
2260   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2261   PetscErrorCode    ierr;
2262   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2263   PetscInt          i,nz,idx,idt,idc,m;
2264   const MatScalar   *aa=a->a,*v;
2265   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2266   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2267   PetscScalar       *x,*t;
2268   const PetscScalar *b;
2269 
2270   PetscFunctionBegin;
2271   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2272   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2273   t  = a->solve_work;
2274 
2275   /* forward solve the lower triangular */
2276   idx    = 0;
2277   t[0]  = b[idx];    t[1]  = b[1+idx];  t[2]  = b[2+idx];  t[3]  = b[3+idx];  t[4]  = b[4+idx];
2278   t[5]  = b[5+idx];  t[6]  = b[6+idx];  t[7]  = b[7+idx];  t[8]  = b[8+idx];  t[9]  = b[9+idx];
2279   t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx];
2280 
2281   for (i=1; i<n; i++) {
2282     v     = aa + bs2*ai[i];
2283     vi    = aj + ai[i];
2284     nz    = ai[i+1] - ai[i];
2285     idx   = bs*i;
2286     s1   = b[idx];    s2  = b[1+idx];  s3  = b[2+idx];  s4  = b[3+idx];  s5  = b[4+idx];
2287     s6   = b[5+idx];  s7  = b[6+idx];  s8  = b[7+idx];  s9  = b[8+idx];  s10 = b[9+idx];
2288     s11  = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx];
2289     for(m=0;m<nz;m++){
2290       idx   = bs*vi[m];
2291       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2292       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2293       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2294 
2295       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2296       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2297       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2298       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2299       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2300       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2301       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2302       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2303       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2304       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2305       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2306       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2307       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2308       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2309       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2310 
2311       v += bs2;
2312     }
2313     idx = bs*i;
2314     t[idx]    = s1;  t[1+idx]  = s2;  t[2+idx]  = s3;  t[3+idx]  = s4;  t[4+idx]  = s5;
2315     t[5+idx]  = s6;  t[6+idx]  = s7;  t[7+idx]  = s8;  t[8+idx]  = s9;  t[9+idx]  = s10;
2316     t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15;
2317 
2318   }
2319   /* backward solve the upper triangular */
2320   for (i=n-1; i>=0; i--){
2321     v    = aa + bs2*(adiag[i+1]+1);
2322     vi   = aj + adiag[i+1]+1;
2323     nz   = adiag[i] - adiag[i+1] - 1;
2324     idt  = bs*i;
2325     s1   = t[idt];     s2  = t[1+idt];  s3  = t[2+idt];  s4  = t[3+idt];  s5  = t[4+idt];
2326     s6   = t[5+idt];   s7  = t[6+idt];  s8  = t[7+idt];  s9  = t[8+idt];  s10 = t[9+idt];
2327     s11  = t[10+idt]; s12  = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt];
2328 
2329     for(m=0;m<nz;m++){
2330       idx   = bs*vi[m];
2331       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2332       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2333       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2334 
2335       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2336       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2337       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2338       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2339       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2340       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2341       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2342       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2343       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2344       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2345       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2346       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2347       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2348       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2349       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2350 
2351       v += bs2;
2352     }
2353     idc = bs*i;
2354 
2355     x[idc]    = t[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2356     x[1+idc]  = t[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2357     x[2+idc]  = t[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2358     x[3+idc]  = t[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2359     x[4+idc]  = t[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2360     x[5+idc]  = t[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2361     x[6+idc]  = t[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2362     x[7+idc]  = t[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2363     x[8+idc]  = t[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2364     x[9+idc]  = t[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2365     x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2366     x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2367     x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2368     x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2369     x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2370 
2371   }
2372 
2373   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2374   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2375   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2376   PetscFunctionReturn(0);
2377 }
2378 
2379 #undef __FUNCT__
2380 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2381 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2382 {
2383   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2384   IS                iscol=a->col,isrow=a->row;
2385   PetscErrorCode    ierr;
2386   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2387   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2388   PetscInt          i,nz,idx,idt,idc;
2389   const MatScalar   *aa=a->a,*v;
2390   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2391   const PetscScalar *b;
2392 
2393   PetscFunctionBegin;
2394   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2395   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2396   t  = a->solve_work;
2397 
2398   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2399   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2400 
2401   /* forward solve the lower triangular */
2402   idx    = 7*(*r++);
2403   t[0] = b[idx];   t[1] = b[1+idx];
2404   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2405   t[5] = b[5+idx]; t[6] = b[6+idx];
2406 
2407   for (i=1; i<n; i++) {
2408     v     = aa + 49*ai[i];
2409     vi    = aj + ai[i];
2410     nz    = diag[i] - ai[i];
2411     idx   = 7*(*r++);
2412     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2413     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2414     while (nz--) {
2415       idx   = 7*(*vi++);
2416       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2417       x4    = t[3+idx];x5 = t[4+idx];
2418       x6    = t[5+idx];x7 = t[6+idx];
2419       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2420       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2421       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2422       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2423       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2424       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2425       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2426       v += 49;
2427     }
2428     idx = 7*i;
2429     t[idx]   = s1;t[1+idx] = s2;
2430     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2431     t[5+idx] = s6;t[6+idx] = s7;
2432   }
2433   /* backward solve the upper triangular */
2434   for (i=n-1; i>=0; i--){
2435     v    = aa + 49*diag[i] + 49;
2436     vi   = aj + diag[i] + 1;
2437     nz   = ai[i+1] - diag[i] - 1;
2438     idt  = 7*i;
2439     s1 = t[idt];  s2 = t[1+idt];
2440     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2441     s6 = t[5+idt];s7 = t[6+idt];
2442     while (nz--) {
2443       idx   = 7*(*vi++);
2444       x1    = t[idx];   x2 = t[1+idx];
2445       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2446       x6    = t[5+idx]; x7 = t[6+idx];
2447       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2448       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2449       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2450       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2451       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2452       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2453       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2454       v += 49;
2455     }
2456     idc = 7*(*c--);
2457     v   = aa + 49*diag[i];
2458     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2459                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2460     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2461                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2462     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2463                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2464     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2465                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2466     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2467                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2468     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2469                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2470     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2471                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2472   }
2473 
2474   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2475   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2476   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2477   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2478   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2479   PetscFunctionReturn(0);
2480 }
2481 
2482 #undef __FUNCT__
2483 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2484 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2485 {
2486   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2487   IS                iscol=a->col,isrow=a->row;
2488   PetscErrorCode    ierr;
2489   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2490   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2491   PetscInt          i,nz,idx,idt,idc,m;
2492   const MatScalar   *aa=a->a,*v;
2493   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2494   const PetscScalar *b;
2495 
2496   PetscFunctionBegin;
2497   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2498   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2499   t  = a->solve_work;
2500 
2501   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2502   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2503 
2504   /* forward solve the lower triangular */
2505   idx    = 7*r[0];
2506   t[0] = b[idx];   t[1] = b[1+idx];
2507   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2508   t[5] = b[5+idx]; t[6] = b[6+idx];
2509 
2510   for (i=1; i<n; i++) {
2511     v     = aa + 49*ai[i];
2512     vi    = aj + ai[i];
2513     nz    = ai[i+1] - ai[i];
2514     idx   = 7*r[i];
2515     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2516     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2517     for(m=0;m<nz;m++){
2518       idx   = 7*vi[m];
2519       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2520       x4    = t[3+idx];x5 = t[4+idx];
2521       x6    = t[5+idx];x7 = t[6+idx];
2522       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2523       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2524       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2525       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2526       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2527       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2528       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2529       v += 49;
2530     }
2531     idx = 7*i;
2532     t[idx]   = s1;t[1+idx] = s2;
2533     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2534     t[5+idx] = s6;t[6+idx] = s7;
2535   }
2536   /* backward solve the upper triangular */
2537   for (i=n-1; i>=0; i--){
2538     v    = aa + 49*(adiag[i+1]+1);
2539     vi   = aj + adiag[i+1]+1;
2540     nz   = adiag[i] - adiag[i+1] - 1;
2541     idt  = 7*i;
2542     s1 = t[idt];  s2 = t[1+idt];
2543     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2544     s6 = t[5+idt];s7 = t[6+idt];
2545     for(m=0;m<nz;m++){
2546       idx   = 7*vi[m];
2547       x1    = t[idx];   x2 = t[1+idx];
2548       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2549       x6    = t[5+idx]; x7 = t[6+idx];
2550       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2551       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2552       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2553       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2554       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2555       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2556       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2557       v += 49;
2558     }
2559     idc = 7*c[i];
2560     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2561                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2562     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2563                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2564     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2565                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2566     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2567                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2568     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2569                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2570     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2571                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2572     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2573                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2574   }
2575 
2576   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2577   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2578   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2579   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2580   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2581   PetscFunctionReturn(0);
2582 }
2583 
2584 #undef __FUNCT__
2585 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2586 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2587 {
2588   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2589   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2590   PetscErrorCode    ierr;
2591   PetscInt          i,nz,idx,idt,jdx;
2592   const MatScalar   *aa=a->a,*v;
2593   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2594   const PetscScalar *b;
2595 
2596   PetscFunctionBegin;
2597   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2598   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2599   /* forward solve the lower triangular */
2600   idx    = 0;
2601   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2602   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2603   x[6] = b[6+idx];
2604   for (i=1; i<n; i++) {
2605     v     =  aa + 49*ai[i];
2606     vi    =  aj + ai[i];
2607     nz    =  diag[i] - ai[i];
2608     idx   =  7*i;
2609     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2610     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2611     s7  =  b[6+idx];
2612     while (nz--) {
2613       jdx   = 7*(*vi++);
2614       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2615       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2616       x7    = x[6+jdx];
2617       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2618       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2619       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2620       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2621       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2622       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2623       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2624       v += 49;
2625      }
2626     x[idx]   = s1;
2627     x[1+idx] = s2;
2628     x[2+idx] = s3;
2629     x[3+idx] = s4;
2630     x[4+idx] = s5;
2631     x[5+idx] = s6;
2632     x[6+idx] = s7;
2633   }
2634   /* backward solve the upper triangular */
2635   for (i=n-1; i>=0; i--){
2636     v    = aa + 49*diag[i] + 49;
2637     vi   = aj + diag[i] + 1;
2638     nz   = ai[i+1] - diag[i] - 1;
2639     idt  = 7*i;
2640     s1 = x[idt];   s2 = x[1+idt];
2641     s3 = x[2+idt]; s4 = x[3+idt];
2642     s5 = x[4+idt]; s6 = x[5+idt];
2643     s7 = x[6+idt];
2644     while (nz--) {
2645       idx   = 7*(*vi++);
2646       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2647       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2648       x7    = x[6+idx];
2649       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2650       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2651       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2652       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2653       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2654       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2655       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2656       v += 49;
2657     }
2658     v        = aa + 49*diag[i];
2659     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2660                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2661     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2662                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2663     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2664                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2665     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2666                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2667     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2668                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2669     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2670                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2671     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2672                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2673   }
2674 
2675   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2676   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2677   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2678   PetscFunctionReturn(0);
2679 }
2680 
2681 #undef __FUNCT__
2682 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2683 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2684 {
2685     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2686     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2687     PetscErrorCode    ierr;
2688     PetscInt          i,k,nz,idx,jdx,idt;
2689     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2690     const MatScalar   *aa=a->a,*v;
2691     PetscScalar       *x;
2692     const PetscScalar *b;
2693     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2694 
2695     PetscFunctionBegin;
2696     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2697     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2698     /* forward solve the lower triangular */
2699     idx    = 0;
2700     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2701     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2702     for (i=1; i<n; i++) {
2703        v    = aa + bs2*ai[i];
2704        vi   = aj + ai[i];
2705        nz   = ai[i+1] - ai[i];
2706       idx   = bs*i;
2707        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2708        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2709        for(k=0;k<nz;k++) {
2710           jdx   = bs*vi[k];
2711           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2712 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2713           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2714           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2715           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2716 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2717           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2718 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2719 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2720           v   +=  bs2;
2721         }
2722 
2723        x[idx]   = s1;
2724        x[1+idx] = s2;
2725        x[2+idx] = s3;
2726        x[3+idx] = s4;
2727        x[4+idx] = s5;
2728        x[5+idx] = s6;
2729        x[6+idx] = s7;
2730     }
2731 
2732    /* backward solve the upper triangular */
2733   for (i=n-1; i>=0; i--){
2734     v   = aa + bs2*(adiag[i+1]+1);
2735      vi  = aj + adiag[i+1]+1;
2736      nz  = adiag[i] - adiag[i+1]-1;
2737      idt = bs*i;
2738      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2739      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2740     for(k=0;k<nz;k++) {
2741       idx   = bs*vi[k];
2742        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2743        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2744        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2745        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2746        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2747        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2748        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2749        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2750        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2751         v   +=  bs2;
2752     }
2753     /* x = inv_diagonal*x */
2754     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2755     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2756     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2757     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2758     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2759     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2760     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2761   }
2762 
2763   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2764   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2765   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2766   PetscFunctionReturn(0);
2767 }
2768 
2769 #undef __FUNCT__
2770 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2771 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2772 {
2773   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2774   IS                iscol=a->col,isrow=a->row;
2775   PetscErrorCode    ierr;
2776   const PetscInt    *r,*c,*rout,*cout;
2777   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2778   PetscInt          i,nz,idx,idt,idc;
2779   const MatScalar   *aa=a->a,*v;
2780   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2781   const PetscScalar *b;
2782 
2783   PetscFunctionBegin;
2784   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2785   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2786   t  = a->solve_work;
2787 
2788   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2789   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2790 
2791   /* forward solve the lower triangular */
2792   idx    = 6*(*r++);
2793   t[0] = b[idx];   t[1] = b[1+idx];
2794   t[2] = b[2+idx]; t[3] = b[3+idx];
2795   t[4] = b[4+idx]; t[5] = b[5+idx];
2796   for (i=1; i<n; i++) {
2797     v     = aa + 36*ai[i];
2798     vi    = aj + ai[i];
2799     nz    = diag[i] - ai[i];
2800     idx   = 6*(*r++);
2801     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2802     s5  = b[4+idx]; s6 = b[5+idx];
2803     while (nz--) {
2804       idx   = 6*(*vi++);
2805       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2806       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2807       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2808       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2809       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2810       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2811       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2812       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2813       v += 36;
2814     }
2815     idx = 6*i;
2816     t[idx]   = s1;t[1+idx] = s2;
2817     t[2+idx] = s3;t[3+idx] = s4;
2818     t[4+idx] = s5;t[5+idx] = s6;
2819   }
2820   /* backward solve the upper triangular */
2821   for (i=n-1; i>=0; i--){
2822     v    = aa + 36*diag[i] + 36;
2823     vi   = aj + diag[i] + 1;
2824     nz   = ai[i+1] - diag[i] - 1;
2825     idt  = 6*i;
2826     s1 = t[idt];  s2 = t[1+idt];
2827     s3 = t[2+idt];s4 = t[3+idt];
2828     s5 = t[4+idt];s6 = t[5+idt];
2829     while (nz--) {
2830       idx   = 6*(*vi++);
2831       x1    = t[idx];   x2 = t[1+idx];
2832       x3    = t[2+idx]; x4 = t[3+idx];
2833       x5    = t[4+idx]; x6 = t[5+idx];
2834       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2835       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2836       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2837       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2838       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2839       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2840       v += 36;
2841     }
2842     idc = 6*(*c--);
2843     v   = aa + 36*diag[i];
2844     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2845                                  v[18]*s4+v[24]*s5+v[30]*s6;
2846     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2847                                  v[19]*s4+v[25]*s5+v[31]*s6;
2848     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2849                                  v[20]*s4+v[26]*s5+v[32]*s6;
2850     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2851                                  v[21]*s4+v[27]*s5+v[33]*s6;
2852     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2853                                  v[22]*s4+v[28]*s5+v[34]*s6;
2854     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2855                                  v[23]*s4+v[29]*s5+v[35]*s6;
2856   }
2857 
2858   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2859   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2860   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2861   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2862   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2863   PetscFunctionReturn(0);
2864 }
2865 
2866 #undef __FUNCT__
2867 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
2868 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2869 {
2870   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2871   IS                iscol=a->col,isrow=a->row;
2872   PetscErrorCode    ierr;
2873   const PetscInt    *r,*c,*rout,*cout;
2874   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2875   PetscInt          i,nz,idx,idt,idc,m;
2876   const MatScalar   *aa=a->a,*v;
2877   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2878   const PetscScalar *b;
2879 
2880   PetscFunctionBegin;
2881   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2882   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2883   t  = a->solve_work;
2884 
2885   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2886   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2887 
2888   /* forward solve the lower triangular */
2889   idx    = 6*r[0];
2890   t[0] = b[idx];   t[1] = b[1+idx];
2891   t[2] = b[2+idx]; t[3] = b[3+idx];
2892   t[4] = b[4+idx]; t[5] = b[5+idx];
2893   for (i=1; i<n; i++) {
2894     v     = aa + 36*ai[i];
2895     vi    = aj + ai[i];
2896     nz    = ai[i+1] - ai[i];
2897     idx   = 6*r[i];
2898     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2899     s5  = b[4+idx]; s6 = b[5+idx];
2900     for(m=0;m<nz;m++){
2901       idx   = 6*vi[m];
2902       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2903       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2904       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2905       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2906       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2907       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2908       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2909       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2910       v += 36;
2911     }
2912     idx = 6*i;
2913     t[idx]   = s1;t[1+idx] = s2;
2914     t[2+idx] = s3;t[3+idx] = s4;
2915     t[4+idx] = s5;t[5+idx] = s6;
2916   }
2917   /* backward solve the upper triangular */
2918   for (i=n-1; i>=0; i--){
2919     v    = aa + 36*(adiag[i+1]+1);
2920     vi   = aj + adiag[i+1]+1;
2921     nz   = adiag[i] - adiag[i+1] - 1;
2922     idt  = 6*i;
2923     s1 = t[idt];  s2 = t[1+idt];
2924     s3 = t[2+idt];s4 = t[3+idt];
2925     s5 = t[4+idt];s6 = t[5+idt];
2926     for(m=0;m<nz;m++){
2927       idx   = 6*vi[m];
2928       x1    = t[idx];   x2 = t[1+idx];
2929       x3    = t[2+idx]; x4 = t[3+idx];
2930       x5    = t[4+idx]; x6 = t[5+idx];
2931       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2932       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2933       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2934       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2935       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2936       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2937       v += 36;
2938     }
2939     idc = 6*c[i];
2940     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2941                                  v[18]*s4+v[24]*s5+v[30]*s6;
2942     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2943                                  v[19]*s4+v[25]*s5+v[31]*s6;
2944     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2945                                  v[20]*s4+v[26]*s5+v[32]*s6;
2946     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2947                                  v[21]*s4+v[27]*s5+v[33]*s6;
2948     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2949                                  v[22]*s4+v[28]*s5+v[34]*s6;
2950     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2951                                  v[23]*s4+v[29]*s5+v[35]*s6;
2952   }
2953 
2954   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2955   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2956   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2957   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2958   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2959   PetscFunctionReturn(0);
2960 }
2961 
2962 #undef __FUNCT__
2963 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
2964 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2965 {
2966   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2967   PetscInt          i,nz,idx,idt,jdx;
2968   PetscErrorCode    ierr;
2969   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
2970   const MatScalar   *aa=a->a,*v;
2971   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2972   const PetscScalar *b;
2973 
2974   PetscFunctionBegin;
2975   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2976   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2977   /* forward solve the lower triangular */
2978   idx    = 0;
2979   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2980   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2981   for (i=1; i<n; i++) {
2982     v     =  aa + 36*ai[i];
2983     vi    =  aj + ai[i];
2984     nz    =  diag[i] - ai[i];
2985     idx   =  6*i;
2986     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2987     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2988     while (nz--) {
2989       jdx   = 6*(*vi++);
2990       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2991       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2992       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2993       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2994       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2995       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2996       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2997       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2998       v += 36;
2999      }
3000     x[idx]   = s1;
3001     x[1+idx] = s2;
3002     x[2+idx] = s3;
3003     x[3+idx] = s4;
3004     x[4+idx] = s5;
3005     x[5+idx] = s6;
3006   }
3007   /* backward solve the upper triangular */
3008   for (i=n-1; i>=0; i--){
3009     v    = aa + 36*diag[i] + 36;
3010     vi   = aj + diag[i] + 1;
3011     nz   = ai[i+1] - diag[i] - 1;
3012     idt  = 6*i;
3013     s1 = x[idt];   s2 = x[1+idt];
3014     s3 = x[2+idt]; s4 = x[3+idt];
3015     s5 = x[4+idt]; s6 = x[5+idt];
3016     while (nz--) {
3017       idx   = 6*(*vi++);
3018       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3019       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3020       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3021       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3022       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3023       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3024       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3025       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3026       v += 36;
3027     }
3028     v        = aa + 36*diag[i];
3029     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3030     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3031     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3032     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3033     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3034     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3035   }
3036 
3037   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3038   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3039   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3040   PetscFunctionReturn(0);
3041 }
3042 
3043 #undef __FUNCT__
3044 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3045 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3046 {
3047     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3048     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3049     PetscErrorCode    ierr;
3050     PetscInt          i,k,nz,idx,jdx,idt;
3051     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3052     const MatScalar   *aa=a->a,*v;
3053     PetscScalar       *x;
3054     const PetscScalar *b;
3055     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3056 
3057     PetscFunctionBegin;
3058     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3059     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3060     /* forward solve the lower triangular */
3061     idx    = 0;
3062     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3063     x[4] = b[4+idx];x[5] = b[5+idx];
3064     for (i=1; i<n; i++) {
3065        v    = aa + bs2*ai[i];
3066        vi   = aj + ai[i];
3067        nz   = ai[i+1] - ai[i];
3068       idx   = bs*i;
3069        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3070        s5   = b[4+idx];s6 = b[5+idx];
3071        for(k=0;k<nz;k++){
3072           jdx   = bs*vi[k];
3073           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3074 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3075           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3076           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3077           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3078 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3079           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3080 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3081           v   +=  bs2;
3082         }
3083 
3084        x[idx]   = s1;
3085        x[1+idx] = s2;
3086        x[2+idx] = s3;
3087        x[3+idx] = s4;
3088        x[4+idx] = s5;
3089        x[5+idx] = s6;
3090     }
3091 
3092    /* backward solve the upper triangular */
3093   for (i=n-1; i>=0; i--){
3094     v   = aa + bs2*(adiag[i+1]+1);
3095      vi  = aj + adiag[i+1]+1;
3096      nz  = adiag[i] - adiag[i+1]-1;
3097      idt = bs*i;
3098      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3099      s5 = x[4+idt];s6 = x[5+idt];
3100      for(k=0;k<nz;k++){
3101       idx   = bs*vi[k];
3102        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3103        x5    = x[4+idx];x6 = x[5+idx];
3104        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3105        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3106        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3107        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3108        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3109        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3110         v   +=  bs2;
3111     }
3112     /* x = inv_diagonal*x */
3113    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3114    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3115    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3116    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3117    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3118    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3119   }
3120 
3121   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3122   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3123   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3124   PetscFunctionReturn(0);
3125 }
3126 
3127 #undef __FUNCT__
3128 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3129 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3130 {
3131   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3132   IS                iscol=a->col,isrow=a->row;
3133   PetscErrorCode    ierr;
3134   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3135   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3136   PetscInt          i,nz,idx,idt,idc;
3137   const MatScalar   *aa=a->a,*v;
3138   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3139   const PetscScalar *b;
3140 
3141   PetscFunctionBegin;
3142   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3143   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3144   t  = a->solve_work;
3145 
3146   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3147   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3148 
3149   /* forward solve the lower triangular */
3150   idx    = 5*(*r++);
3151   t[0] = b[idx];   t[1] = b[1+idx];
3152   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3153   for (i=1; i<n; i++) {
3154     v     = aa + 25*ai[i];
3155     vi    = aj + ai[i];
3156     nz    = diag[i] - ai[i];
3157     idx   = 5*(*r++);
3158     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3159     s5  = b[4+idx];
3160     while (nz--) {
3161       idx   = 5*(*vi++);
3162       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3163       x4    = t[3+idx];x5 = t[4+idx];
3164       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3165       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3166       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3167       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3168       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3169       v += 25;
3170     }
3171     idx = 5*i;
3172     t[idx]   = s1;t[1+idx] = s2;
3173     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3174   }
3175   /* backward solve the upper triangular */
3176   for (i=n-1; i>=0; i--){
3177     v    = aa + 25*diag[i] + 25;
3178     vi   = aj + diag[i] + 1;
3179     nz   = ai[i+1] - diag[i] - 1;
3180     idt  = 5*i;
3181     s1 = t[idt];  s2 = t[1+idt];
3182     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3183     while (nz--) {
3184       idx   = 5*(*vi++);
3185       x1    = t[idx];   x2 = t[1+idx];
3186       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3187       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3188       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3189       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3190       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3191       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3192       v += 25;
3193     }
3194     idc = 5*(*c--);
3195     v   = aa + 25*diag[i];
3196     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3197                                  v[15]*s4+v[20]*s5;
3198     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3199                                  v[16]*s4+v[21]*s5;
3200     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3201                                  v[17]*s4+v[22]*s5;
3202     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3203                                  v[18]*s4+v[23]*s5;
3204     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3205                                  v[19]*s4+v[24]*s5;
3206   }
3207 
3208   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3209   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3210   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3211   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3212   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3213   PetscFunctionReturn(0);
3214 }
3215 
3216 #undef __FUNCT__
3217 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3218 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3219 {
3220   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3221   IS                iscol=a->col,isrow=a->row;
3222   PetscErrorCode    ierr;
3223   const PetscInt    *r,*c,*rout,*cout;
3224   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3225   PetscInt          i,nz,idx,idt,idc,m;
3226   const MatScalar   *aa=a->a,*v;
3227   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3228   const PetscScalar *b;
3229 
3230   PetscFunctionBegin;
3231   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3232   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3233   t  = a->solve_work;
3234 
3235   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3236   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3237 
3238   /* forward solve the lower triangular */
3239   idx    = 5*r[0];
3240   t[0] = b[idx];   t[1] = b[1+idx];
3241   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3242   for (i=1; i<n; i++) {
3243     v     = aa + 25*ai[i];
3244     vi    = aj + ai[i];
3245     nz    = ai[i+1] - ai[i];
3246     idx   = 5*r[i];
3247     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3248     s5  = b[4+idx];
3249     for(m=0;m<nz;m++){
3250       idx   = 5*vi[m];
3251       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3252       x4    = t[3+idx];x5 = t[4+idx];
3253       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3254       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3255       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3256       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3257       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3258       v += 25;
3259     }
3260     idx = 5*i;
3261     t[idx]   = s1;t[1+idx] = s2;
3262     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3263   }
3264   /* backward solve the upper triangular */
3265   for (i=n-1; i>=0; i--){
3266     v    = aa + 25*(adiag[i+1]+1);
3267     vi   = aj + adiag[i+1]+1;
3268     nz   = adiag[i] - adiag[i+1] - 1;
3269     idt  = 5*i;
3270     s1 = t[idt];  s2 = t[1+idt];
3271     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3272     for(m=0;m<nz;m++){
3273       idx   = 5*vi[m];
3274       x1    = t[idx];   x2 = t[1+idx];
3275       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3276       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3277       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3278       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3279       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3280       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3281       v += 25;
3282     }
3283     idc = 5*c[i];
3284     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3285                                  v[15]*s4+v[20]*s5;
3286     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3287                                  v[16]*s4+v[21]*s5;
3288     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3289                                  v[17]*s4+v[22]*s5;
3290     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3291                                  v[18]*s4+v[23]*s5;
3292     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3293                                  v[19]*s4+v[24]*s5;
3294   }
3295 
3296   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3297   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3298   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3299   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3300   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3301   PetscFunctionReturn(0);
3302 }
3303 
3304 #undef __FUNCT__
3305 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3306 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3307 {
3308   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3309   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3310   PetscInt          i,nz,idx,idt,jdx;
3311   PetscErrorCode    ierr;
3312   const MatScalar   *aa=a->a,*v;
3313   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3314   const PetscScalar *b;
3315 
3316   PetscFunctionBegin;
3317   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3318   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3319   /* forward solve the lower triangular */
3320   idx    = 0;
3321   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3322   for (i=1; i<n; i++) {
3323     v     =  aa + 25*ai[i];
3324     vi    =  aj + ai[i];
3325     nz    =  diag[i] - ai[i];
3326     idx   =  5*i;
3327     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3328     while (nz--) {
3329       jdx   = 5*(*vi++);
3330       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3331       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3332       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3333       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3334       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3335       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3336       v    += 25;
3337     }
3338     x[idx]   = s1;
3339     x[1+idx] = s2;
3340     x[2+idx] = s3;
3341     x[3+idx] = s4;
3342     x[4+idx] = s5;
3343   }
3344   /* backward solve the upper triangular */
3345   for (i=n-1; i>=0; i--){
3346     v    = aa + 25*diag[i] + 25;
3347     vi   = aj + diag[i] + 1;
3348     nz   = ai[i+1] - diag[i] - 1;
3349     idt  = 5*i;
3350     s1 = x[idt];  s2 = x[1+idt];
3351     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3352     while (nz--) {
3353       idx   = 5*(*vi++);
3354       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3355       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3356       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3357       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3358       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3359       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3360       v    += 25;
3361     }
3362     v        = aa + 25*diag[i];
3363     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3364     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3365     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3366     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3367     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3368   }
3369 
3370   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3371   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3372   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3373   PetscFunctionReturn(0);
3374 }
3375 
3376 #undef __FUNCT__
3377 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3378 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3379 {
3380   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3381   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3382   PetscInt          i,k,nz,idx,idt,jdx;
3383   PetscErrorCode    ierr;
3384   const MatScalar   *aa=a->a,*v;
3385   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3386   const PetscScalar *b;
3387 
3388   PetscFunctionBegin;
3389   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3390   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3391   /* forward solve the lower triangular */
3392   idx    = 0;
3393   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3394   for (i=1; i<n; i++) {
3395     v   = aa + 25*ai[i];
3396     vi  = aj + ai[i];
3397     nz  = ai[i+1] - ai[i];
3398     idx = 5*i;
3399     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3400     for(k=0;k<nz;k++) {
3401       jdx   = 5*vi[k];
3402       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3403       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3404       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3405       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3406       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3407       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3408       v    += 25;
3409     }
3410     x[idx]   = s1;
3411     x[1+idx] = s2;
3412     x[2+idx] = s3;
3413     x[3+idx] = s4;
3414     x[4+idx] = s5;
3415   }
3416 
3417   /* backward solve the upper triangular */
3418   for (i=n-1; i>=0; i--){
3419     v   = aa + 25*(adiag[i+1]+1);
3420     vi  = aj + adiag[i+1]+1;
3421     nz  = adiag[i] - adiag[i+1]-1;
3422     idt = 5*i;
3423     s1 = x[idt];  s2 = x[1+idt];
3424     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3425     for(k=0;k<nz;k++){
3426       idx   = 5*vi[k];
3427       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3428       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3429       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3430       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3431       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3432       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3433       v    += 25;
3434     }
3435     /* x = inv_diagonal*x */
3436     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3437     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3438     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3439     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3440     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3441   }
3442 
3443   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3444   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3445   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3446   PetscFunctionReturn(0);
3447 }
3448 
3449 #undef __FUNCT__
3450 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3451 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3452 {
3453   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3454   IS                iscol=a->col,isrow=a->row;
3455   PetscErrorCode    ierr;
3456   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3457   PetscInt          i,nz,idx,idt,idc;
3458   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3459   const MatScalar   *aa=a->a,*v;
3460   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3461   const PetscScalar *b;
3462 
3463   PetscFunctionBegin;
3464   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3465   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3466   t  = a->solve_work;
3467 
3468   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3469   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3470 
3471   /* forward solve the lower triangular */
3472   idx    = 4*(*r++);
3473   t[0] = b[idx];   t[1] = b[1+idx];
3474   t[2] = b[2+idx]; t[3] = b[3+idx];
3475   for (i=1; i<n; i++) {
3476     v     = aa + 16*ai[i];
3477     vi    = aj + ai[i];
3478     nz    = diag[i] - ai[i];
3479     idx   = 4*(*r++);
3480     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3481     while (nz--) {
3482       idx   = 4*(*vi++);
3483       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3484       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3485       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3486       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3487       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3488       v    += 16;
3489     }
3490     idx        = 4*i;
3491     t[idx]   = s1;t[1+idx] = s2;
3492     t[2+idx] = s3;t[3+idx] = s4;
3493   }
3494   /* backward solve the upper triangular */
3495   for (i=n-1; i>=0; i--){
3496     v    = aa + 16*diag[i] + 16;
3497     vi   = aj + diag[i] + 1;
3498     nz   = ai[i+1] - diag[i] - 1;
3499     idt  = 4*i;
3500     s1 = t[idt];  s2 = t[1+idt];
3501     s3 = t[2+idt];s4 = t[3+idt];
3502     while (nz--) {
3503       idx   = 4*(*vi++);
3504       x1    = t[idx];   x2 = t[1+idx];
3505       x3    = t[2+idx]; x4 = t[3+idx];
3506       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3507       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3508       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3509       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3510       v += 16;
3511     }
3512     idc      = 4*(*c--);
3513     v        = aa + 16*diag[i];
3514     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3515     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3516     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3517     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3518   }
3519 
3520   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3521   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3522   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3523   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3524   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3525   PetscFunctionReturn(0);
3526 }
3527 
3528 #undef __FUNCT__
3529 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3530 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3531 {
3532   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3533   IS                iscol=a->col,isrow=a->row;
3534   PetscErrorCode    ierr;
3535   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3536   PetscInt          i,nz,idx,idt,idc,m;
3537   const PetscInt    *r,*c,*rout,*cout;
3538   const MatScalar   *aa=a->a,*v;
3539   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3540   const PetscScalar *b;
3541 
3542   PetscFunctionBegin;
3543   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3544   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3545   t  = a->solve_work;
3546 
3547   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3548   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3549 
3550   /* forward solve the lower triangular */
3551   idx    = 4*r[0];
3552   t[0] = b[idx];   t[1] = b[1+idx];
3553   t[2] = b[2+idx]; t[3] = b[3+idx];
3554   for (i=1; i<n; i++) {
3555     v     = aa + 16*ai[i];
3556     vi    = aj + ai[i];
3557     nz    = ai[i+1] - ai[i];
3558     idx   = 4*r[i];
3559     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3560     for(m=0;m<nz;m++){
3561       idx   = 4*vi[m];
3562       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3563       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3564       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3565       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3566       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3567       v    += 16;
3568     }
3569     idx        = 4*i;
3570     t[idx]   = s1;t[1+idx] = s2;
3571     t[2+idx] = s3;t[3+idx] = s4;
3572   }
3573   /* backward solve the upper triangular */
3574   for (i=n-1; i>=0; i--){
3575     v    = aa + 16*(adiag[i+1]+1);
3576     vi   = aj + adiag[i+1]+1;
3577     nz   = adiag[i] - adiag[i+1] - 1;
3578     idt  = 4*i;
3579     s1 = t[idt];  s2 = t[1+idt];
3580     s3 = t[2+idt];s4 = t[3+idt];
3581     for(m=0;m<nz;m++){
3582       idx   = 4*vi[m];
3583       x1    = t[idx];   x2 = t[1+idx];
3584       x3    = t[2+idx]; x4 = t[3+idx];
3585       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3586       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3587       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3588       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3589       v += 16;
3590     }
3591     idc      = 4*c[i];
3592     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3593     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3594     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3595     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3596   }
3597 
3598   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3599   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3600   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3601   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3602   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3603   PetscFunctionReturn(0);
3604 }
3605 
3606 #undef __FUNCT__
3607 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3608 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3609 {
3610   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3611   IS                iscol=a->col,isrow=a->row;
3612   PetscErrorCode    ierr;
3613   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3614   PetscInt          i,nz,idx,idt,idc;
3615   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3616   const MatScalar   *aa=a->a,*v;
3617   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3618   PetscScalar       *x;
3619   const PetscScalar *b;
3620 
3621   PetscFunctionBegin;
3622   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3623   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3624   t  = (MatScalar *)a->solve_work;
3625 
3626   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3627   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3628 
3629   /* forward solve the lower triangular */
3630   idx    = 4*(*r++);
3631   t[0] = (MatScalar)b[idx];
3632   t[1] = (MatScalar)b[1+idx];
3633   t[2] = (MatScalar)b[2+idx];
3634   t[3] = (MatScalar)b[3+idx];
3635   for (i=1; i<n; i++) {
3636     v     = aa + 16*ai[i];
3637     vi    = aj + ai[i];
3638     nz    = diag[i] - ai[i];
3639     idx   = 4*(*r++);
3640     s1 = (MatScalar)b[idx];
3641     s2 = (MatScalar)b[1+idx];
3642     s3 = (MatScalar)b[2+idx];
3643     s4 = (MatScalar)b[3+idx];
3644     while (nz--) {
3645       idx   = 4*(*vi++);
3646       x1  = t[idx];
3647       x2  = t[1+idx];
3648       x3  = t[2+idx];
3649       x4  = t[3+idx];
3650       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3651       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3652       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3653       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3654       v    += 16;
3655     }
3656     idx        = 4*i;
3657     t[idx]   = s1;
3658     t[1+idx] = s2;
3659     t[2+idx] = s3;
3660     t[3+idx] = s4;
3661   }
3662   /* backward solve the upper triangular */
3663   for (i=n-1; i>=0; i--){
3664     v    = aa + 16*diag[i] + 16;
3665     vi   = aj + diag[i] + 1;
3666     nz   = ai[i+1] - diag[i] - 1;
3667     idt  = 4*i;
3668     s1 = t[idt];
3669     s2 = t[1+idt];
3670     s3 = t[2+idt];
3671     s4 = t[3+idt];
3672     while (nz--) {
3673       idx   = 4*(*vi++);
3674       x1  = t[idx];
3675       x2  = t[1+idx];
3676       x3  = t[2+idx];
3677       x4  = t[3+idx];
3678       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3679       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3680       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3681       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3682       v += 16;
3683     }
3684     idc      = 4*(*c--);
3685     v        = aa + 16*diag[i];
3686     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3687     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3688     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3689     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3690     x[idc]   = (PetscScalar)t[idt];
3691     x[1+idc] = (PetscScalar)t[1+idt];
3692     x[2+idc] = (PetscScalar)t[2+idt];
3693     x[3+idc] = (PetscScalar)t[3+idt];
3694  }
3695 
3696   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3697   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3698   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3699   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3700   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3701   PetscFunctionReturn(0);
3702 }
3703 
3704 #if defined (PETSC_HAVE_SSE)
3705 
3706 #include PETSC_HAVE_SSE
3707 
3708 #undef __FUNCT__
3709 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3710 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3711 {
3712   /*
3713      Note: This code uses demotion of double
3714      to float when performing the mixed-mode computation.
3715      This may not be numerically reasonable for all applications.
3716   */
3717   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3718   IS             iscol=a->col,isrow=a->row;
3719   PetscErrorCode ierr;
3720   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3721   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3722   MatScalar      *aa=a->a,*v;
3723   PetscScalar    *x,*b,*t;
3724 
3725   /* Make space in temp stack for 16 Byte Aligned arrays */
3726   float           ssealignedspace[11],*tmps,*tmpx;
3727   unsigned long   offset;
3728 
3729   PetscFunctionBegin;
3730   SSE_SCOPE_BEGIN;
3731 
3732     offset = (unsigned long)ssealignedspace % 16;
3733     if (offset) offset = (16 - offset)/4;
3734     tmps = &ssealignedspace[offset];
3735     tmpx = &ssealignedspace[offset+4];
3736     PREFETCH_NTA(aa+16*ai[1]);
3737 
3738     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3739     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3740     t  = a->solve_work;
3741 
3742     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3743     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3744 
3745     /* forward solve the lower triangular */
3746     idx  = 4*(*r++);
3747     t[0] = b[idx];   t[1] = b[1+idx];
3748     t[2] = b[2+idx]; t[3] = b[3+idx];
3749     v    =  aa + 16*ai[1];
3750 
3751     for (i=1; i<n;) {
3752       PREFETCH_NTA(&v[8]);
3753       vi   =  aj      + ai[i];
3754       nz   =  diag[i] - ai[i];
3755       idx  =  4*(*r++);
3756 
3757       /* Demote sum from double to float */
3758       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3759       LOAD_PS(tmps,XMM7);
3760 
3761       while (nz--) {
3762         PREFETCH_NTA(&v[16]);
3763         idx = 4*(*vi++);
3764 
3765         /* Demote solution (so far) from double to float */
3766         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3767 
3768         /* 4x4 Matrix-Vector product with negative accumulation: */
3769         SSE_INLINE_BEGIN_2(tmpx,v)
3770           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3771 
3772           /* First Column */
3773           SSE_COPY_PS(XMM0,XMM6)
3774           SSE_SHUFFLE(XMM0,XMM0,0x00)
3775           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3776           SSE_SUB_PS(XMM7,XMM0)
3777 
3778           /* Second Column */
3779           SSE_COPY_PS(XMM1,XMM6)
3780           SSE_SHUFFLE(XMM1,XMM1,0x55)
3781           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3782           SSE_SUB_PS(XMM7,XMM1)
3783 
3784           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3785 
3786           /* Third Column */
3787           SSE_COPY_PS(XMM2,XMM6)
3788           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3789           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3790           SSE_SUB_PS(XMM7,XMM2)
3791 
3792           /* Fourth Column */
3793           SSE_COPY_PS(XMM3,XMM6)
3794           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3795           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3796           SSE_SUB_PS(XMM7,XMM3)
3797         SSE_INLINE_END_2
3798 
3799         v  += 16;
3800       }
3801       idx = 4*i;
3802       v   = aa + 16*ai[++i];
3803       PREFETCH_NTA(v);
3804       STORE_PS(tmps,XMM7);
3805 
3806       /* Promote result from float to double */
3807       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3808     }
3809     /* backward solve the upper triangular */
3810     idt  = 4*(n-1);
3811     ai16 = 16*diag[n-1];
3812     v    = aa + ai16 + 16;
3813     for (i=n-1; i>=0;){
3814       PREFETCH_NTA(&v[8]);
3815       vi = aj + diag[i] + 1;
3816       nz = ai[i+1] - diag[i] - 1;
3817 
3818       /* Demote accumulator from double to float */
3819       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3820       LOAD_PS(tmps,XMM7);
3821 
3822       while (nz--) {
3823         PREFETCH_NTA(&v[16]);
3824         idx = 4*(*vi++);
3825 
3826         /* Demote solution (so far) from double to float */
3827         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3828 
3829         /* 4x4 Matrix-Vector Product with negative accumulation: */
3830         SSE_INLINE_BEGIN_2(tmpx,v)
3831           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3832 
3833           /* First Column */
3834           SSE_COPY_PS(XMM0,XMM6)
3835           SSE_SHUFFLE(XMM0,XMM0,0x00)
3836           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3837           SSE_SUB_PS(XMM7,XMM0)
3838 
3839           /* Second Column */
3840           SSE_COPY_PS(XMM1,XMM6)
3841           SSE_SHUFFLE(XMM1,XMM1,0x55)
3842           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3843           SSE_SUB_PS(XMM7,XMM1)
3844 
3845           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3846 
3847           /* Third Column */
3848           SSE_COPY_PS(XMM2,XMM6)
3849           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3850           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3851           SSE_SUB_PS(XMM7,XMM2)
3852 
3853           /* Fourth Column */
3854           SSE_COPY_PS(XMM3,XMM6)
3855           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3856           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3857           SSE_SUB_PS(XMM7,XMM3)
3858         SSE_INLINE_END_2
3859         v  += 16;
3860       }
3861       v    = aa + ai16;
3862       ai16 = 16*diag[--i];
3863       PREFETCH_NTA(aa+ai16+16);
3864       /*
3865          Scale the result by the diagonal 4x4 block,
3866          which was inverted as part of the factorization
3867       */
3868       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3869         /* First Column */
3870         SSE_COPY_PS(XMM0,XMM7)
3871         SSE_SHUFFLE(XMM0,XMM0,0x00)
3872         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3873 
3874         /* Second Column */
3875         SSE_COPY_PS(XMM1,XMM7)
3876         SSE_SHUFFLE(XMM1,XMM1,0x55)
3877         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3878         SSE_ADD_PS(XMM0,XMM1)
3879 
3880         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3881 
3882         /* Third Column */
3883         SSE_COPY_PS(XMM2,XMM7)
3884         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3885         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3886         SSE_ADD_PS(XMM0,XMM2)
3887 
3888         /* Fourth Column */
3889         SSE_COPY_PS(XMM3,XMM7)
3890         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3891         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3892         SSE_ADD_PS(XMM0,XMM3)
3893 
3894         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3895       SSE_INLINE_END_3
3896 
3897       /* Promote solution from float to double */
3898       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3899 
3900       /* Apply reordering to t and stream into x.    */
3901       /* This way, x doesn't pollute the cache.      */
3902       /* Be careful with size: 2 doubles = 4 floats! */
3903       idc  = 4*(*c--);
3904       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3905         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3906         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3907         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3908         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3909         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3910         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3911       SSE_INLINE_END_2
3912       v    = aa + ai16 + 16;
3913       idt -= 4;
3914     }
3915 
3916     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3917     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3918     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3919     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3920     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3921   SSE_SCOPE_END;
3922   PetscFunctionReturn(0);
3923 }
3924 
3925 #endif
3926 
3927 
3928 /*
3929       Special case where the matrix was ILU(0) factored in the natural
3930    ordering. This eliminates the need for the column and row permutation.
3931 */
3932 #undef __FUNCT__
3933 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
3934 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3935 {
3936   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3937   PetscInt          n=a->mbs;
3938   const PetscInt    *ai=a->i,*aj=a->j;
3939   PetscErrorCode    ierr;
3940   const PetscInt    *diag = a->diag;
3941   const MatScalar   *aa=a->a;
3942   PetscScalar       *x;
3943   const PetscScalar *b;
3944 
3945   PetscFunctionBegin;
3946   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3947   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3948 
3949 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3950   {
3951     static PetscScalar w[2000]; /* very BAD need to fix */
3952     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3953   }
3954 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3955   {
3956     static PetscScalar w[2000]; /* very BAD need to fix */
3957     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3958   }
3959 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3960   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3961 #else
3962   {
3963     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3964     const MatScalar *v;
3965     PetscInt        jdx,idt,idx,nz,i,ai16;
3966     const PetscInt  *vi;
3967 
3968   /* forward solve the lower triangular */
3969   idx    = 0;
3970   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3971   for (i=1; i<n; i++) {
3972     v     =  aa      + 16*ai[i];
3973     vi    =  aj      + ai[i];
3974     nz    =  diag[i] - ai[i];
3975     idx   +=  4;
3976     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3977     while (nz--) {
3978       jdx   = 4*(*vi++);
3979       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3980       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3981       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3982       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3983       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3984       v    += 16;
3985     }
3986     x[idx]   = s1;
3987     x[1+idx] = s2;
3988     x[2+idx] = s3;
3989     x[3+idx] = s4;
3990   }
3991   /* backward solve the upper triangular */
3992   idt = 4*(n-1);
3993   for (i=n-1; i>=0; i--){
3994     ai16 = 16*diag[i];
3995     v    = aa + ai16 + 16;
3996     vi   = aj + diag[i] + 1;
3997     nz   = ai[i+1] - diag[i] - 1;
3998     s1 = x[idt];  s2 = x[1+idt];
3999     s3 = x[2+idt];s4 = x[3+idt];
4000     while (nz--) {
4001       idx   = 4*(*vi++);
4002       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4003       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4004       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4005       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4006       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4007       v    += 16;
4008     }
4009     v        = aa + ai16;
4010     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4011     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4012     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4013     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4014     idt -= 4;
4015   }
4016   }
4017 #endif
4018 
4019   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4020   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4021   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4022   PetscFunctionReturn(0);
4023 }
4024 
4025 #undef __FUNCT__
4026 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4027 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4028 {
4029     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4030     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4031     PetscInt          i,k,nz,idx,jdx,idt;
4032     PetscErrorCode    ierr;
4033     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4034     const MatScalar   *aa=a->a,*v;
4035     PetscScalar       *x;
4036     const PetscScalar *b;
4037     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4038 
4039     PetscFunctionBegin;
4040     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4041     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4042     /* forward solve the lower triangular */
4043     idx    = 0;
4044     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4045     for (i=1; i<n; i++) {
4046        v    = aa + bs2*ai[i];
4047        vi   = aj + ai[i];
4048        nz   = ai[i+1] - ai[i];
4049       idx   = bs*i;
4050        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4051       for(k=0;k<nz;k++) {
4052           jdx   = bs*vi[k];
4053           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4054           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4055           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4056           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4057 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4058 
4059           v   +=  bs2;
4060         }
4061 
4062        x[idx]   = s1;
4063        x[1+idx] = s2;
4064        x[2+idx] = s3;
4065        x[3+idx] = s4;
4066     }
4067 
4068    /* backward solve the upper triangular */
4069   for (i=n-1; i>=0; i--){
4070     v   = aa + bs2*(adiag[i+1]+1);
4071      vi  = aj + adiag[i+1]+1;
4072      nz  = adiag[i] - adiag[i+1]-1;
4073      idt = bs*i;
4074      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4075 
4076     for(k=0;k<nz;k++){
4077       idx   = bs*vi[k];
4078        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4079        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4080        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4081        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4082        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4083 
4084         v   +=  bs2;
4085     }
4086     /* x = inv_diagonal*x */
4087    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4088    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4089    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4090    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4091 
4092   }
4093 
4094   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4095   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4096   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4097   PetscFunctionReturn(0);
4098 }
4099 
4100 #undef __FUNCT__
4101 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4102 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4103 {
4104   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4105   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4106   PetscErrorCode    ierr;
4107   const MatScalar   *aa=a->a;
4108   const PetscScalar *b;
4109   PetscScalar       *x;
4110 
4111   PetscFunctionBegin;
4112   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4113   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4114 
4115   {
4116     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4117     const MatScalar  *v;
4118     MatScalar        *t=(MatScalar *)x;
4119     PetscInt         jdx,idt,idx,nz,i,ai16;
4120     const PetscInt   *vi;
4121 
4122     /* forward solve the lower triangular */
4123     idx  = 0;
4124     t[0] = (MatScalar)b[0];
4125     t[1] = (MatScalar)b[1];
4126     t[2] = (MatScalar)b[2];
4127     t[3] = (MatScalar)b[3];
4128     for (i=1; i<n; i++) {
4129       v     =  aa      + 16*ai[i];
4130       vi    =  aj      + ai[i];
4131       nz    =  diag[i] - ai[i];
4132       idx   +=  4;
4133       s1 = (MatScalar)b[idx];
4134       s2 = (MatScalar)b[1+idx];
4135       s3 = (MatScalar)b[2+idx];
4136       s4 = (MatScalar)b[3+idx];
4137       while (nz--) {
4138         jdx = 4*(*vi++);
4139         x1  = t[jdx];
4140         x2  = t[1+jdx];
4141         x3  = t[2+jdx];
4142         x4  = t[3+jdx];
4143         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4144         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4145         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4146         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4147         v    += 16;
4148       }
4149       t[idx]   = s1;
4150       t[1+idx] = s2;
4151       t[2+idx] = s3;
4152       t[3+idx] = s4;
4153     }
4154     /* backward solve the upper triangular */
4155     idt = 4*(n-1);
4156     for (i=n-1; i>=0; i--){
4157       ai16 = 16*diag[i];
4158       v    = aa + ai16 + 16;
4159       vi   = aj + diag[i] + 1;
4160       nz   = ai[i+1] - diag[i] - 1;
4161       s1   = t[idt];
4162       s2   = t[1+idt];
4163       s3   = t[2+idt];
4164       s4   = t[3+idt];
4165       while (nz--) {
4166         idx = 4*(*vi++);
4167         x1  = (MatScalar)x[idx];
4168         x2  = (MatScalar)x[1+idx];
4169         x3  = (MatScalar)x[2+idx];
4170         x4  = (MatScalar)x[3+idx];
4171         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4172         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4173         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4174         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4175         v    += 16;
4176       }
4177       v        = aa + ai16;
4178       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4179       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4180       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4181       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4182       idt -= 4;
4183     }
4184   }
4185 
4186   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4187   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4188   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4189   PetscFunctionReturn(0);
4190 }
4191 
4192 #if defined (PETSC_HAVE_SSE)
4193 
4194 #include PETSC_HAVE_SSE
4195 #undef __FUNCT__
4196 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4197 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4198 {
4199   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4200   unsigned short *aj=(unsigned short *)a->j;
4201   PetscErrorCode ierr;
4202   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4203   MatScalar      *aa=a->a;
4204   PetscScalar    *x,*b;
4205 
4206   PetscFunctionBegin;
4207   SSE_SCOPE_BEGIN;
4208   /*
4209      Note: This code currently uses demotion of double
4210      to float when performing the mixed-mode computation.
4211      This may not be numerically reasonable for all applications.
4212   */
4213   PREFETCH_NTA(aa+16*ai[1]);
4214 
4215   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4216   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4217   {
4218     /* x will first be computed in single precision then promoted inplace to double */
4219     MatScalar      *v,*t=(MatScalar *)x;
4220     int            nz,i,idt,ai16;
4221     unsigned int   jdx,idx;
4222     unsigned short *vi;
4223     /* Forward solve the lower triangular factor. */
4224 
4225     /* First block is the identity. */
4226     idx  = 0;
4227     CONVERT_DOUBLE4_FLOAT4(t,b);
4228     v    =  aa + 16*((unsigned int)ai[1]);
4229 
4230     for (i=1; i<n;) {
4231       PREFETCH_NTA(&v[8]);
4232       vi   =  aj      + ai[i];
4233       nz   =  diag[i] - ai[i];
4234       idx +=  4;
4235 
4236       /* Demote RHS from double to float. */
4237       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4238       LOAD_PS(&t[idx],XMM7);
4239 
4240       while (nz--) {
4241         PREFETCH_NTA(&v[16]);
4242         jdx = 4*((unsigned int)(*vi++));
4243 
4244         /* 4x4 Matrix-Vector product with negative accumulation: */
4245         SSE_INLINE_BEGIN_2(&t[jdx],v)
4246           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4247 
4248           /* First Column */
4249           SSE_COPY_PS(XMM0,XMM6)
4250           SSE_SHUFFLE(XMM0,XMM0,0x00)
4251           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4252           SSE_SUB_PS(XMM7,XMM0)
4253 
4254           /* Second Column */
4255           SSE_COPY_PS(XMM1,XMM6)
4256           SSE_SHUFFLE(XMM1,XMM1,0x55)
4257           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4258           SSE_SUB_PS(XMM7,XMM1)
4259 
4260           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4261 
4262           /* Third Column */
4263           SSE_COPY_PS(XMM2,XMM6)
4264           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4265           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4266           SSE_SUB_PS(XMM7,XMM2)
4267 
4268           /* Fourth Column */
4269           SSE_COPY_PS(XMM3,XMM6)
4270           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4271           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4272           SSE_SUB_PS(XMM7,XMM3)
4273         SSE_INLINE_END_2
4274 
4275         v  += 16;
4276       }
4277       v    =  aa + 16*ai[++i];
4278       PREFETCH_NTA(v);
4279       STORE_PS(&t[idx],XMM7);
4280     }
4281 
4282     /* Backward solve the upper triangular factor.*/
4283 
4284     idt  = 4*(n-1);
4285     ai16 = 16*diag[n-1];
4286     v    = aa + ai16 + 16;
4287     for (i=n-1; i>=0;){
4288       PREFETCH_NTA(&v[8]);
4289       vi = aj + diag[i] + 1;
4290       nz = ai[i+1] - diag[i] - 1;
4291 
4292       LOAD_PS(&t[idt],XMM7);
4293 
4294       while (nz--) {
4295         PREFETCH_NTA(&v[16]);
4296         idx = 4*((unsigned int)(*vi++));
4297 
4298         /* 4x4 Matrix-Vector Product with negative accumulation: */
4299         SSE_INLINE_BEGIN_2(&t[idx],v)
4300           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4301 
4302           /* First Column */
4303           SSE_COPY_PS(XMM0,XMM6)
4304           SSE_SHUFFLE(XMM0,XMM0,0x00)
4305           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4306           SSE_SUB_PS(XMM7,XMM0)
4307 
4308           /* Second Column */
4309           SSE_COPY_PS(XMM1,XMM6)
4310           SSE_SHUFFLE(XMM1,XMM1,0x55)
4311           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4312           SSE_SUB_PS(XMM7,XMM1)
4313 
4314           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4315 
4316           /* Third Column */
4317           SSE_COPY_PS(XMM2,XMM6)
4318           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4319           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4320           SSE_SUB_PS(XMM7,XMM2)
4321 
4322           /* Fourth Column */
4323           SSE_COPY_PS(XMM3,XMM6)
4324           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4325           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4326           SSE_SUB_PS(XMM7,XMM3)
4327         SSE_INLINE_END_2
4328         v  += 16;
4329       }
4330       v    = aa + ai16;
4331       ai16 = 16*diag[--i];
4332       PREFETCH_NTA(aa+ai16+16);
4333       /*
4334          Scale the result by the diagonal 4x4 block,
4335          which was inverted as part of the factorization
4336       */
4337       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4338         /* First Column */
4339         SSE_COPY_PS(XMM0,XMM7)
4340         SSE_SHUFFLE(XMM0,XMM0,0x00)
4341         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4342 
4343         /* Second Column */
4344         SSE_COPY_PS(XMM1,XMM7)
4345         SSE_SHUFFLE(XMM1,XMM1,0x55)
4346         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4347         SSE_ADD_PS(XMM0,XMM1)
4348 
4349         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4350 
4351         /* Third Column */
4352         SSE_COPY_PS(XMM2,XMM7)
4353         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4354         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4355         SSE_ADD_PS(XMM0,XMM2)
4356 
4357         /* Fourth Column */
4358         SSE_COPY_PS(XMM3,XMM7)
4359         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4360         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4361         SSE_ADD_PS(XMM0,XMM3)
4362 
4363         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4364       SSE_INLINE_END_3
4365 
4366       v    = aa + ai16 + 16;
4367       idt -= 4;
4368     }
4369 
4370     /* Convert t from single precision back to double precision (inplace)*/
4371     idt = 4*(n-1);
4372     for (i=n-1;i>=0;i--) {
4373       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4374       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4375       PetscScalar *xtemp=&x[idt];
4376       MatScalar   *ttemp=&t[idt];
4377       xtemp[3] = (PetscScalar)ttemp[3];
4378       xtemp[2] = (PetscScalar)ttemp[2];
4379       xtemp[1] = (PetscScalar)ttemp[1];
4380       xtemp[0] = (PetscScalar)ttemp[0];
4381       idt -= 4;
4382     }
4383 
4384   } /* End of artificial scope. */
4385   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4386   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4387   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4388   SSE_SCOPE_END;
4389   PetscFunctionReturn(0);
4390 }
4391 
4392 #undef __FUNCT__
4393 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4394 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4395 {
4396   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4397   int            *aj=a->j;
4398   PetscErrorCode ierr;
4399   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4400   MatScalar      *aa=a->a;
4401   PetscScalar    *x,*b;
4402 
4403   PetscFunctionBegin;
4404   SSE_SCOPE_BEGIN;
4405   /*
4406      Note: This code currently uses demotion of double
4407      to float when performing the mixed-mode computation.
4408      This may not be numerically reasonable for all applications.
4409   */
4410   PREFETCH_NTA(aa+16*ai[1]);
4411 
4412   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4413   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4414   {
4415     /* x will first be computed in single precision then promoted inplace to double */
4416     MatScalar *v,*t=(MatScalar *)x;
4417     int       nz,i,idt,ai16;
4418     int       jdx,idx;
4419     int       *vi;
4420     /* Forward solve the lower triangular factor. */
4421 
4422     /* First block is the identity. */
4423     idx  = 0;
4424     CONVERT_DOUBLE4_FLOAT4(t,b);
4425     v    =  aa + 16*ai[1];
4426 
4427     for (i=1; i<n;) {
4428       PREFETCH_NTA(&v[8]);
4429       vi   =  aj      + ai[i];
4430       nz   =  diag[i] - ai[i];
4431       idx +=  4;
4432 
4433       /* Demote RHS from double to float. */
4434       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4435       LOAD_PS(&t[idx],XMM7);
4436 
4437       while (nz--) {
4438         PREFETCH_NTA(&v[16]);
4439         jdx = 4*(*vi++);
4440 /*          jdx = *vi++; */
4441 
4442         /* 4x4 Matrix-Vector product with negative accumulation: */
4443         SSE_INLINE_BEGIN_2(&t[jdx],v)
4444           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4445 
4446           /* First Column */
4447           SSE_COPY_PS(XMM0,XMM6)
4448           SSE_SHUFFLE(XMM0,XMM0,0x00)
4449           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4450           SSE_SUB_PS(XMM7,XMM0)
4451 
4452           /* Second Column */
4453           SSE_COPY_PS(XMM1,XMM6)
4454           SSE_SHUFFLE(XMM1,XMM1,0x55)
4455           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4456           SSE_SUB_PS(XMM7,XMM1)
4457 
4458           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4459 
4460           /* Third Column */
4461           SSE_COPY_PS(XMM2,XMM6)
4462           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4463           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4464           SSE_SUB_PS(XMM7,XMM2)
4465 
4466           /* Fourth Column */
4467           SSE_COPY_PS(XMM3,XMM6)
4468           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4469           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4470           SSE_SUB_PS(XMM7,XMM3)
4471         SSE_INLINE_END_2
4472 
4473         v  += 16;
4474       }
4475       v    =  aa + 16*ai[++i];
4476       PREFETCH_NTA(v);
4477       STORE_PS(&t[idx],XMM7);
4478     }
4479 
4480     /* Backward solve the upper triangular factor.*/
4481 
4482     idt  = 4*(n-1);
4483     ai16 = 16*diag[n-1];
4484     v    = aa + ai16 + 16;
4485     for (i=n-1; i>=0;){
4486       PREFETCH_NTA(&v[8]);
4487       vi = aj + diag[i] + 1;
4488       nz = ai[i+1] - diag[i] - 1;
4489 
4490       LOAD_PS(&t[idt],XMM7);
4491 
4492       while (nz--) {
4493         PREFETCH_NTA(&v[16]);
4494         idx = 4*(*vi++);
4495 /*          idx = *vi++; */
4496 
4497         /* 4x4 Matrix-Vector Product with negative accumulation: */
4498         SSE_INLINE_BEGIN_2(&t[idx],v)
4499           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4500 
4501           /* First Column */
4502           SSE_COPY_PS(XMM0,XMM6)
4503           SSE_SHUFFLE(XMM0,XMM0,0x00)
4504           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4505           SSE_SUB_PS(XMM7,XMM0)
4506 
4507           /* Second Column */
4508           SSE_COPY_PS(XMM1,XMM6)
4509           SSE_SHUFFLE(XMM1,XMM1,0x55)
4510           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4511           SSE_SUB_PS(XMM7,XMM1)
4512 
4513           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4514 
4515           /* Third Column */
4516           SSE_COPY_PS(XMM2,XMM6)
4517           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4518           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4519           SSE_SUB_PS(XMM7,XMM2)
4520 
4521           /* Fourth Column */
4522           SSE_COPY_PS(XMM3,XMM6)
4523           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4524           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4525           SSE_SUB_PS(XMM7,XMM3)
4526         SSE_INLINE_END_2
4527         v  += 16;
4528       }
4529       v    = aa + ai16;
4530       ai16 = 16*diag[--i];
4531       PREFETCH_NTA(aa+ai16+16);
4532       /*
4533          Scale the result by the diagonal 4x4 block,
4534          which was inverted as part of the factorization
4535       */
4536       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4537         /* First Column */
4538         SSE_COPY_PS(XMM0,XMM7)
4539         SSE_SHUFFLE(XMM0,XMM0,0x00)
4540         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4541 
4542         /* Second Column */
4543         SSE_COPY_PS(XMM1,XMM7)
4544         SSE_SHUFFLE(XMM1,XMM1,0x55)
4545         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4546         SSE_ADD_PS(XMM0,XMM1)
4547 
4548         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4549 
4550         /* Third Column */
4551         SSE_COPY_PS(XMM2,XMM7)
4552         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4553         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4554         SSE_ADD_PS(XMM0,XMM2)
4555 
4556         /* Fourth Column */
4557         SSE_COPY_PS(XMM3,XMM7)
4558         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4559         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4560         SSE_ADD_PS(XMM0,XMM3)
4561 
4562         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4563       SSE_INLINE_END_3
4564 
4565       v    = aa + ai16 + 16;
4566       idt -= 4;
4567     }
4568 
4569     /* Convert t from single precision back to double precision (inplace)*/
4570     idt = 4*(n-1);
4571     for (i=n-1;i>=0;i--) {
4572       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4573       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4574       PetscScalar *xtemp=&x[idt];
4575       MatScalar   *ttemp=&t[idt];
4576       xtemp[3] = (PetscScalar)ttemp[3];
4577       xtemp[2] = (PetscScalar)ttemp[2];
4578       xtemp[1] = (PetscScalar)ttemp[1];
4579       xtemp[0] = (PetscScalar)ttemp[0];
4580       idt -= 4;
4581     }
4582 
4583   } /* End of artificial scope. */
4584   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4585   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4586   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4587   SSE_SCOPE_END;
4588   PetscFunctionReturn(0);
4589 }
4590 
4591 #endif
4592 
4593 #undef __FUNCT__
4594 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4595 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4596 {
4597   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4598   IS                iscol=a->col,isrow=a->row;
4599   PetscErrorCode    ierr;
4600   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4601   PetscInt          i,nz,idx,idt,idc;
4602   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4603   const MatScalar   *aa=a->a,*v;
4604   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4605   const PetscScalar *b;
4606 
4607   PetscFunctionBegin;
4608   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4609   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4610   t  = a->solve_work;
4611 
4612   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4613   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4614 
4615   /* forward solve the lower triangular */
4616   idx    = 3*(*r++);
4617   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4618   for (i=1; i<n; i++) {
4619     v     = aa + 9*ai[i];
4620     vi    = aj + ai[i];
4621     nz    = diag[i] - ai[i];
4622     idx   = 3*(*r++);
4623     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4624     while (nz--) {
4625       idx   = 3*(*vi++);
4626       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4627       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4628       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4629       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4630       v += 9;
4631     }
4632     idx = 3*i;
4633     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4634   }
4635   /* backward solve the upper triangular */
4636   for (i=n-1; i>=0; i--){
4637     v    = aa + 9*diag[i] + 9;
4638     vi   = aj + diag[i] + 1;
4639     nz   = ai[i+1] - diag[i] - 1;
4640     idt  = 3*i;
4641     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4642     while (nz--) {
4643       idx   = 3*(*vi++);
4644       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4645       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4646       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4647       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4648       v += 9;
4649     }
4650     idc = 3*(*c--);
4651     v   = aa + 9*diag[i];
4652     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4653     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4654     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4655   }
4656   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4657   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4658   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4659   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4660   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4661   PetscFunctionReturn(0);
4662 }
4663 
4664 #undef __FUNCT__
4665 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4666 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4667 {
4668   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4669   IS                iscol=a->col,isrow=a->row;
4670   PetscErrorCode    ierr;
4671   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4672   PetscInt          i,nz,idx,idt,idc,m;
4673   const PetscInt    *r,*c,*rout,*cout;
4674   const MatScalar   *aa=a->a,*v;
4675   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4676   const PetscScalar *b;
4677 
4678   PetscFunctionBegin;
4679   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4680   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4681   t  = a->solve_work;
4682 
4683   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4684   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4685 
4686   /* forward solve the lower triangular */
4687   idx    = 3*r[0];
4688   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4689   for (i=1; i<n; i++) {
4690     v     = aa + 9*ai[i];
4691     vi    = aj + ai[i];
4692     nz    = ai[i+1] - ai[i];
4693     idx   = 3*r[i];
4694     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4695     for(m=0;m<nz;m++){
4696       idx   = 3*vi[m];
4697       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4698       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4699       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4700       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4701       v += 9;
4702     }
4703     idx = 3*i;
4704     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4705   }
4706   /* backward solve the upper triangular */
4707   for (i=n-1; i>=0; i--){
4708     v    = aa + 9*(adiag[i+1]+1);
4709     vi   = aj + adiag[i+1]+1;
4710     nz   = adiag[i] - adiag[i+1] - 1;
4711     idt  = 3*i;
4712     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4713     for(m=0;m<nz;m++){
4714       idx   = 3*vi[m];
4715       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4716       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4717       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4718       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4719       v += 9;
4720     }
4721     idc = 3*c[i];
4722     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4723     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4724     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4725   }
4726   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4727   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4728   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4729   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4730   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4731   PetscFunctionReturn(0);
4732 }
4733 
4734 /*
4735       Special case where the matrix was ILU(0) factored in the natural
4736    ordering. This eliminates the need for the column and row permutation.
4737 */
4738 #undef __FUNCT__
4739 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4740 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4741 {
4742   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4743   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4744   PetscErrorCode    ierr;
4745   const PetscInt    *diag = a->diag,*vi;
4746   const MatScalar   *aa=a->a,*v;
4747   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4748   const PetscScalar *b;
4749   PetscInt          jdx,idt,idx,nz,i;
4750 
4751   PetscFunctionBegin;
4752   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4753   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4754 
4755   /* forward solve the lower triangular */
4756   idx    = 0;
4757   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4758   for (i=1; i<n; i++) {
4759     v     =  aa      + 9*ai[i];
4760     vi    =  aj      + ai[i];
4761     nz    =  diag[i] - ai[i];
4762     idx   +=  3;
4763     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4764     while (nz--) {
4765       jdx   = 3*(*vi++);
4766       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4767       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4768       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4769       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4770       v    += 9;
4771     }
4772     x[idx]   = s1;
4773     x[1+idx] = s2;
4774     x[2+idx] = s3;
4775   }
4776   /* backward solve the upper triangular */
4777   for (i=n-1; i>=0; i--){
4778     v    = aa + 9*diag[i] + 9;
4779     vi   = aj + diag[i] + 1;
4780     nz   = ai[i+1] - diag[i] - 1;
4781     idt  = 3*i;
4782     s1 = x[idt];  s2 = x[1+idt];
4783     s3 = x[2+idt];
4784     while (nz--) {
4785       idx   = 3*(*vi++);
4786       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4787       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4788       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4789       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4790       v    += 9;
4791     }
4792     v        = aa +  9*diag[i];
4793     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4794     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4795     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4796   }
4797 
4798   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4799   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4800   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4801   PetscFunctionReturn(0);
4802 }
4803 
4804 #undef __FUNCT__
4805 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4806 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4807 {
4808     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4809     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4810     PetscErrorCode    ierr;
4811     PetscInt          i,k,nz,idx,jdx,idt;
4812     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4813     const MatScalar   *aa=a->a,*v;
4814     PetscScalar       *x;
4815     const PetscScalar *b;
4816     PetscScalar        s1,s2,s3,x1,x2,x3;
4817 
4818     PetscFunctionBegin;
4819     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4820     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4821     /* forward solve the lower triangular */
4822     idx    = 0;
4823     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4824     for (i=1; i<n; i++) {
4825        v    = aa + bs2*ai[i];
4826        vi   = aj + ai[i];
4827        nz   = ai[i+1] - ai[i];
4828       idx   = bs*i;
4829        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4830       for(k=0;k<nz;k++){
4831          jdx   = bs*vi[k];
4832           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4833           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4834           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4835           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4836 
4837           v   +=  bs2;
4838         }
4839 
4840        x[idx]   = s1;
4841        x[1+idx] = s2;
4842        x[2+idx] = s3;
4843     }
4844 
4845    /* backward solve the upper triangular */
4846   for (i=n-1; i>=0; i--){
4847     v   = aa + bs2*(adiag[i+1]+1);
4848      vi  = aj + adiag[i+1]+1;
4849      nz  = adiag[i] - adiag[i+1]-1;
4850      idt = bs*i;
4851      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4852 
4853      for(k=0;k<nz;k++){
4854        idx   = bs*vi[k];
4855        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4856        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4857        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4858        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4859 
4860         v   +=  bs2;
4861     }
4862     /* x = inv_diagonal*x */
4863    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4864    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4865    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4866 
4867   }
4868 
4869   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4870   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4871   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4872   PetscFunctionReturn(0);
4873 }
4874 
4875 #undef __FUNCT__
4876 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
4877 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
4878 {
4879   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4880   IS                iscol=a->col,isrow=a->row;
4881   PetscErrorCode    ierr;
4882   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4883   PetscInt          i,nz,idx,idt,idc;
4884   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4885   const MatScalar   *aa=a->a,*v;
4886   PetscScalar       *x,s1,s2,x1,x2,*t;
4887   const PetscScalar *b;
4888 
4889   PetscFunctionBegin;
4890   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4891   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4892   t  = a->solve_work;
4893 
4894   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4895   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4896 
4897   /* forward solve the lower triangular */
4898   idx    = 2*(*r++);
4899   t[0] = b[idx]; t[1] = b[1+idx];
4900   for (i=1; i<n; i++) {
4901     v     = aa + 4*ai[i];
4902     vi    = aj + ai[i];
4903     nz    = diag[i] - ai[i];
4904     idx   = 2*(*r++);
4905     s1  = b[idx]; s2 = b[1+idx];
4906     while (nz--) {
4907       idx   = 2*(*vi++);
4908       x1    = t[idx]; x2 = t[1+idx];
4909       s1 -= v[0]*x1 + v[2]*x2;
4910       s2 -= v[1]*x1 + v[3]*x2;
4911       v += 4;
4912     }
4913     idx = 2*i;
4914     t[idx] = s1; t[1+idx] = s2;
4915   }
4916   /* backward solve the upper triangular */
4917   for (i=n-1; i>=0; i--){
4918     v    = aa + 4*diag[i] + 4;
4919     vi   = aj + diag[i] + 1;
4920     nz   = ai[i+1] - diag[i] - 1;
4921     idt  = 2*i;
4922     s1 = t[idt]; s2 = t[1+idt];
4923     while (nz--) {
4924       idx   = 2*(*vi++);
4925       x1    = t[idx]; x2 = t[1+idx];
4926       s1 -= v[0]*x1 + v[2]*x2;
4927       s2 -= v[1]*x1 + v[3]*x2;
4928       v += 4;
4929     }
4930     idc = 2*(*c--);
4931     v   = aa + 4*diag[i];
4932     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4933     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4934   }
4935   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4936   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4937   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4938   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4939   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4940   PetscFunctionReturn(0);
4941 }
4942 
4943 #undef __FUNCT__
4944 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4945 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4946 {
4947   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4948   IS                iscol=a->col,isrow=a->row;
4949   PetscErrorCode    ierr;
4950   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4951   PetscInt          i,nz,idx,jdx,idt,idc,m;
4952   const PetscInt    *r,*c,*rout,*cout;
4953   const MatScalar   *aa=a->a,*v;
4954   PetscScalar       *x,s1,s2,x1,x2,*t;
4955   const PetscScalar *b;
4956 
4957   PetscFunctionBegin;
4958   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4959   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4960   t  = a->solve_work;
4961 
4962   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4963   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4964 
4965   /* forward solve the lower triangular */
4966   idx    = 2*r[0];
4967   t[0] = b[idx]; t[1] = b[1+idx];
4968   for (i=1; i<n; i++) {
4969     v     = aa + 4*ai[i];
4970     vi    = aj + ai[i];
4971     nz    = ai[i+1] - ai[i];
4972     idx   = 2*r[i];
4973     s1  = b[idx]; s2 = b[1+idx];
4974     for(m=0;m<nz;m++){
4975       jdx   = 2*vi[m];
4976       x1    = t[jdx]; x2 = t[1+jdx];
4977       s1 -= v[0]*x1 + v[2]*x2;
4978       s2 -= v[1]*x1 + v[3]*x2;
4979       v += 4;
4980     }
4981     idx = 2*i;
4982     t[idx] = s1; t[1+idx] = s2;
4983   }
4984   /* backward solve the upper triangular */
4985   for (i=n-1; i>=0; i--){
4986     v    = aa + 4*(adiag[i+1]+1);
4987     vi   = aj + adiag[i+1]+1;
4988     nz   = adiag[i] - adiag[i+1] - 1;
4989     idt  = 2*i;
4990     s1 = t[idt]; s2 = t[1+idt];
4991     for(m=0;m<nz;m++){
4992       idx   = 2*vi[m];
4993       x1    = t[idx]; x2 = t[1+idx];
4994       s1 -= v[0]*x1 + v[2]*x2;
4995       s2 -= v[1]*x1 + v[3]*x2;
4996       v += 4;
4997     }
4998     idc = 2*c[i];
4999     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5000     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5001   }
5002   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5003   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5004   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5005   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5006   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5007   PetscFunctionReturn(0);
5008 }
5009 
5010 /*
5011       Special case where the matrix was ILU(0) factored in the natural
5012    ordering. This eliminates the need for the column and row permutation.
5013 */
5014 #undef __FUNCT__
5015 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
5016 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5017 {
5018   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5019   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5020   PetscErrorCode    ierr;
5021   const MatScalar   *aa=a->a,*v;
5022   PetscScalar       *x,s1,s2,x1,x2;
5023   const PetscScalar *b;
5024   PetscInt          jdx,idt,idx,nz,i;
5025 
5026   PetscFunctionBegin;
5027   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5028   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5029 
5030   /* forward solve the lower triangular */
5031   idx    = 0;
5032   x[0]   = b[0]; x[1] = b[1];
5033   for (i=1; i<n; i++) {
5034     v     =  aa      + 4*ai[i];
5035     vi    =  aj      + ai[i];
5036     nz    =  diag[i] - ai[i];
5037     idx   +=  2;
5038     s1  =  b[idx];s2 = b[1+idx];
5039     while (nz--) {
5040       jdx   = 2*(*vi++);
5041       x1    = x[jdx];x2 = x[1+jdx];
5042       s1 -= v[0]*x1 + v[2]*x2;
5043       s2 -= v[1]*x1 + v[3]*x2;
5044       v    += 4;
5045     }
5046     x[idx]   = s1;
5047     x[1+idx] = s2;
5048   }
5049   /* backward solve the upper triangular */
5050   for (i=n-1; i>=0; i--){
5051     v    = aa + 4*diag[i] + 4;
5052     vi   = aj + diag[i] + 1;
5053     nz   = ai[i+1] - diag[i] - 1;
5054     idt  = 2*i;
5055     s1 = x[idt];  s2 = x[1+idt];
5056     while (nz--) {
5057       idx   = 2*(*vi++);
5058       x1    = x[idx];   x2 = x[1+idx];
5059       s1 -= v[0]*x1 + v[2]*x2;
5060       s2 -= v[1]*x1 + v[3]*x2;
5061       v    += 4;
5062     }
5063     v        = aa +  4*diag[i];
5064     x[idt]   = v[0]*s1 + v[2]*s2;
5065     x[1+idt] = v[1]*s1 + v[3]*s2;
5066   }
5067 
5068   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5069   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5070   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5071   PetscFunctionReturn(0);
5072 }
5073 
5074 #undef __FUNCT__
5075 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5076 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5077 {
5078     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5079     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5080     PetscInt          i,k,nz,idx,idt,jdx;
5081     PetscErrorCode    ierr;
5082     const MatScalar   *aa=a->a,*v;
5083     PetscScalar       *x,s1,s2,x1,x2;
5084     const PetscScalar *b;
5085 
5086     PetscFunctionBegin;
5087     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5088     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5089     /* forward solve the lower triangular */
5090     idx    = 0;
5091     x[0] = b[idx]; x[1] = b[1+idx];
5092     for (i=1; i<n; i++) {
5093         v   = aa + 4*ai[i];
5094        vi   = aj + ai[i];
5095        nz   = ai[i+1] - ai[i];
5096        idx  = 2*i;
5097        s1   = b[idx];s2 = b[1+idx];
5098       for(k=0;k<nz;k++){
5099          jdx   = 2*vi[k];
5100           x1    = x[jdx];x2 = x[1+jdx];
5101           s1   -= v[0]*x1 + v[2]*x2;
5102           s2   -= v[1]*x1 + v[3]*x2;
5103            v   +=  4;
5104         }
5105        x[idx]   = s1;
5106        x[1+idx] = s2;
5107     }
5108 
5109    /* backward solve the upper triangular */
5110   for (i=n-1; i>=0; i--){
5111      v   = aa + 4*(adiag[i+1]+1);
5112      vi  = aj + adiag[i+1]+1;
5113      nz  = adiag[i] - adiag[i+1]-1;
5114      idt = 2*i;
5115      s1 = x[idt];  s2 = x[1+idt];
5116      for(k=0;k<nz;k++){
5117       idx   = 2*vi[k];
5118        x1    = x[idx];   x2 = x[1+idx];
5119        s1 -= v[0]*x1 + v[2]*x2;
5120        s2 -= v[1]*x1 + v[3]*x2;
5121          v    += 4;
5122     }
5123     /* x = inv_diagonal*x */
5124    x[idt]   = v[0]*s1 + v[2]*s2;
5125    x[1+idt] = v[1]*s1 + v[3]*s2;
5126   }
5127 
5128   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5129   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5130   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5131   PetscFunctionReturn(0);
5132 }
5133 
5134 #undef __FUNCT__
5135 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5136 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5137 {
5138   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5139   IS                iscol=a->col,isrow=a->row;
5140   PetscErrorCode    ierr;
5141   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5142   PetscInt          i,nz;
5143   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5144   const MatScalar   *aa=a->a,*v;
5145   PetscScalar       *x,s1,*t;
5146   const PetscScalar *b;
5147 
5148   PetscFunctionBegin;
5149   if (!n) PetscFunctionReturn(0);
5150 
5151   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5152   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5153   t  = a->solve_work;
5154 
5155   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5156   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5157 
5158   /* forward solve the lower triangular */
5159   t[0] = b[*r++];
5160   for (i=1; i<n; i++) {
5161     v     = aa + ai[i];
5162     vi    = aj + ai[i];
5163     nz    = diag[i] - ai[i];
5164     s1  = b[*r++];
5165     while (nz--) {
5166       s1 -= (*v++)*t[*vi++];
5167     }
5168     t[i] = s1;
5169   }
5170   /* backward solve the upper triangular */
5171   for (i=n-1; i>=0; i--){
5172     v    = aa + diag[i] + 1;
5173     vi   = aj + diag[i] + 1;
5174     nz   = ai[i+1] - diag[i] - 1;
5175     s1 = t[i];
5176     while (nz--) {
5177       s1 -= (*v++)*t[*vi++];
5178     }
5179     x[*c--] = t[i] = aa[diag[i]]*s1;
5180   }
5181 
5182   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5183   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5184   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5185   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5186   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5187   PetscFunctionReturn(0);
5188 }
5189 /*
5190       Special case where the matrix was ILU(0) factored in the natural
5191    ordering. This eliminates the need for the column and row permutation.
5192 */
5193 #undef __FUNCT__
5194 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5195 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5196 {
5197   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5198   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5199   PetscErrorCode    ierr;
5200   const MatScalar   *aa=a->a,*v;
5201   PetscScalar       *x;
5202   const PetscScalar *b;
5203   PetscScalar       s1,x1;
5204   PetscInt          jdx,idt,idx,nz,i;
5205 
5206   PetscFunctionBegin;
5207   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5208   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5209 
5210   /* forward solve the lower triangular */
5211   idx    = 0;
5212   x[0]   = b[0];
5213   for (i=1; i<n; i++) {
5214     v     =  aa      + ai[i];
5215     vi    =  aj      + ai[i];
5216     nz    =  diag[i] - ai[i];
5217     idx   +=  1;
5218     s1  =  b[idx];
5219     while (nz--) {
5220       jdx   = *vi++;
5221       x1    = x[jdx];
5222       s1 -= v[0]*x1;
5223       v    += 1;
5224     }
5225     x[idx]   = s1;
5226   }
5227   /* backward solve the upper triangular */
5228   for (i=n-1; i>=0; i--){
5229     v    = aa + diag[i] + 1;
5230     vi   = aj + diag[i] + 1;
5231     nz   = ai[i+1] - diag[i] - 1;
5232     idt  = i;
5233     s1 = x[idt];
5234     while (nz--) {
5235       idx   = *vi++;
5236       x1    = x[idx];
5237       s1 -= v[0]*x1;
5238       v    += 1;
5239     }
5240     v        = aa +  diag[i];
5241     x[idt]   = v[0]*s1;
5242   }
5243   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5244   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5245   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5246   PetscFunctionReturn(0);
5247 }
5248 
5249 /* ----------------------------------------------------------------*/
5250 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5251 
5252 #undef __FUNCT__
5253 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5254 /*
5255    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5256 */
5257 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5258 {
5259   Mat             C=B;
5260   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5261   PetscErrorCode  ierr;
5262   PetscInt        i,j,k,ipvt[15];
5263   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5264   PetscInt        nz,nzL,row;
5265   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5266   const MatScalar *v,*aa=a->a;
5267   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5268 
5269   PetscFunctionBegin;
5270 
5271   /* generate work space needed by the factorization */
5272   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5273   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5274 
5275   for (i=0; i<n; i++){
5276     /* zero rtmp */
5277     /* L part */
5278     nz    = bi[i+1] - bi[i];
5279     bjtmp = bj + bi[i];
5280     for  (j=0; j<nz; j++){
5281       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5282     }
5283 
5284     /* U part */
5285     nz = bdiag[i] - bdiag[i+1];
5286     bjtmp = bj + bdiag[i+1]+1;
5287     for  (j=0; j<nz; j++){
5288       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5289     }
5290 
5291     /* load in initial (unfactored row) */
5292     nz    = ai[i+1] - ai[i];
5293     ajtmp = aj + ai[i];
5294     v     = aa + bs2*ai[i];
5295     for (j=0; j<nz; j++) {
5296       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5297     }
5298 
5299     /* elimination */
5300     bjtmp = bj + bi[i];
5301     nzL   = bi[i+1] - bi[i];
5302     for(k=0;k < nzL;k++) {
5303       row = bjtmp[k];
5304       pc = rtmp + bs2*row;
5305       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5306       if (flg) {
5307         pv = b->a + bs2*bdiag[row];
5308 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5309 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5310 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5311         pv = b->a + bs2*(bdiag[row+1]+1);
5312         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5313         for (j=0; j<nz; j++) {
5314           vv   = rtmp + bs2*pj[j];
5315           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5316 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5317 	  pv  += bs2;
5318         }
5319         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5320       }
5321     }
5322 
5323     /* finished row so stick it into b->a */
5324     /* L part */
5325     pv   = b->a + bs2*bi[i] ;
5326     pj   = b->j + bi[i] ;
5327     nz   = bi[i+1] - bi[i];
5328     for (j=0; j<nz; j++) {
5329       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5330     }
5331 
5332     /* Mark diagonal and invert diagonal for simplier triangular solves */
5333     pv   = b->a + bs2*bdiag[i];
5334     pj   = b->j + bdiag[i];
5335     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5336     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5337     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftinblocks);CHKERRQ(ierr);
5338 
5339     /* U part */
5340     pv = b->a + bs2*(bdiag[i+1]+1);
5341     pj = b->j + bdiag[i+1]+1;
5342     nz = bdiag[i] - bdiag[i+1] - 1;
5343     for (j=0; j<nz; j++){
5344       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5345     }
5346   }
5347 
5348   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5349   C->ops->solve          = MatSolve_SeqBAIJ_15_NaturalOrdering;
5350   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5351   C->assembled = PETSC_TRUE;
5352   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5353   PetscFunctionReturn(0);
5354 }
5355 
5356 #undef __FUNCT__
5357 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5358 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5359 {
5360   Mat            C=B;
5361   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5362   IS             isrow = b->row,isicol = b->icol;
5363   PetscErrorCode ierr;
5364   const PetscInt *r,*ic,*ics;
5365   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5366   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5367   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5368   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5369   MatScalar      *v_work;
5370   PetscTruth     col_identity,row_identity,both_identity;
5371 
5372   PetscFunctionBegin;
5373   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5374   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5375 
5376   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5377   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5378   ics  = ic;
5379 
5380   /* generate work space needed by dense LU factorization */
5381   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5382 
5383   for (i=0; i<n; i++){
5384     /* zero rtmp */
5385     /* L part */
5386     nz    = bi[i+1] - bi[i];
5387     bjtmp = bj + bi[i];
5388     for  (j=0; j<nz; j++){
5389       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5390     }
5391 
5392     /* U part */
5393     nz = bdiag[i] - bdiag[i+1];
5394     bjtmp = bj + bdiag[i+1]+1;
5395     for  (j=0; j<nz; j++){
5396       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5397     }
5398 
5399     /* load in initial (unfactored row) */
5400     nz    = ai[r[i]+1] - ai[r[i]];
5401     ajtmp = aj + ai[r[i]];
5402     v     = aa + bs2*ai[r[i]];
5403     for (j=0; j<nz; j++) {
5404       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5405     }
5406 
5407     /* elimination */
5408     bjtmp = bj + bi[i];
5409     nzL   = bi[i+1] - bi[i];
5410     for(k=0;k < nzL;k++) {
5411       row = bjtmp[k];
5412       pc = rtmp + bs2*row;
5413       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5414       if (flg) {
5415         pv         = b->a + bs2*bdiag[row];
5416         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5417         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5418         pv         = b->a + bs2*(bdiag[row+1]+1);
5419         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5420         for (j=0; j<nz; j++) {
5421           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5422         }
5423         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5424       }
5425     }
5426 
5427     /* finished row so stick it into b->a */
5428     /* L part */
5429     pv   = b->a + bs2*bi[i] ;
5430     pj   = b->j + bi[i] ;
5431     nz   = bi[i+1] - bi[i];
5432     for (j=0; j<nz; j++) {
5433       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5434     }
5435 
5436     /* Mark diagonal and invert diagonal for simplier triangular solves */
5437     pv  = b->a + bs2*bdiag[i];
5438     pj  = b->j + bdiag[i];
5439     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5440     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5441     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5442 
5443     /* U part */
5444     pv = b->a + bs2*(bdiag[i+1]+1);
5445     pj = b->j + bdiag[i+1]+1;
5446     nz = bdiag[i] - bdiag[i+1] - 1;
5447     for (j=0; j<nz; j++){
5448       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5449     }
5450   }
5451 
5452   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5453   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5454   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5455   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5456 
5457   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5458   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5459   both_identity = (PetscTruth) (row_identity && col_identity);
5460   if (both_identity){
5461     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5462   } else {
5463     C->ops->solve = MatSolve_SeqBAIJ_N;
5464   }
5465   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5466 
5467   C->assembled = PETSC_TRUE;
5468   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5469   PetscFunctionReturn(0);
5470 }
5471 
5472 /*
5473    ilu(0) with natural ordering under new data structure.
5474    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5475    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5476 */
5477 
5478 #undef __FUNCT__
5479 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5480 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5481 {
5482 
5483   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5484   PetscErrorCode     ierr;
5485   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5486   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5487 
5488   PetscFunctionBegin;
5489   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5490   b    = (Mat_SeqBAIJ*)(fact)->data;
5491 
5492   /* allocate matrix arrays for new data structure */
5493   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5494   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5495   b->singlemalloc = PETSC_TRUE;
5496   if (!b->diag){
5497     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5498     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5499   }
5500   bdiag = b->diag;
5501 
5502   if (n > 0) {
5503     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5504   }
5505 
5506   /* set bi and bj with new data structure */
5507   bi = b->i;
5508   bj = b->j;
5509 
5510   /* L part */
5511   bi[0] = 0;
5512   for (i=0; i<n; i++){
5513     nz = adiag[i] - ai[i];
5514     bi[i+1] = bi[i] + nz;
5515     aj = a->j + ai[i];
5516     for (j=0; j<nz; j++){
5517       *bj = aj[j]; bj++;
5518     }
5519   }
5520 
5521   /* U part */
5522   bi_temp = bi[n];
5523   bdiag[n] = bi[n]-1;
5524   for (i=n-1; i>=0; i--){
5525     nz = ai[i+1] - adiag[i] - 1;
5526     bi_temp = bi_temp + nz + 1;
5527     aj = a->j + adiag[i] + 1;
5528     for (j=0; j<nz; j++){
5529       *bj = aj[j]; bj++;
5530     }
5531     /* diag[i] */
5532     *bj = i; bj++;
5533     bdiag[i] = bi_temp - 1;
5534   }
5535   PetscFunctionReturn(0);
5536 }
5537 
5538 #undef __FUNCT__
5539 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5540 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5541 {
5542   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5543   IS                 isicol;
5544   PetscErrorCode     ierr;
5545   const PetscInt     *r,*ic;
5546   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5547   PetscInt           *bi,*cols,nnz,*cols_lvl;
5548   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5549   PetscInt           i,levels,diagonal_fill;
5550   PetscTruth         col_identity,row_identity,both_identity;
5551   PetscReal          f;
5552   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5553   PetscBT            lnkbt;
5554   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5555   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5556   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5557   PetscTruth         missing;
5558   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5559 
5560   PetscFunctionBegin;
5561   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5562   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5563   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5564 
5565   f             = info->fill;
5566   levels        = (PetscInt)info->levels;
5567   diagonal_fill = (PetscInt)info->diagonal_fill;
5568   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5569 
5570   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5571   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5572   both_identity = (PetscTruth) (row_identity && col_identity);
5573 
5574   if (!levels && both_identity) {
5575     /* special case: ilu(0) with natural ordering */
5576     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5577     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5578 
5579     fact->factor = MAT_FACTOR_ILU;
5580     (fact)->info.factor_mallocs    = 0;
5581     (fact)->info.fill_ratio_given  = info->fill;
5582     (fact)->info.fill_ratio_needed = 1.0;
5583     b                = (Mat_SeqBAIJ*)(fact)->data;
5584     b->row           = isrow;
5585     b->col           = iscol;
5586     b->icol          = isicol;
5587     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5588     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5589     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5590     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5591     PetscFunctionReturn(0);
5592   }
5593 
5594   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5595   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5596 
5597   /* get new row pointers */
5598   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5599   bi[0] = 0;
5600   /* bdiag is location of diagonal in factor */
5601   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5602   bdiag[0]  = 0;
5603 
5604   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5605 
5606   /* create a linked list for storing column indices of the active row */
5607   nlnk = n + 1;
5608   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5609 
5610   /* initial FreeSpace size is f*(ai[n]+1) */
5611   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5612   current_space = free_space;
5613   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5614   current_space_lvl = free_space_lvl;
5615 
5616   for (i=0; i<n; i++) {
5617     nzi = 0;
5618     /* copy current row into linked list */
5619     nnz  = ai[r[i]+1] - ai[r[i]];
5620     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5621     cols = aj + ai[r[i]];
5622     lnk[i] = -1; /* marker to indicate if diagonal exists */
5623     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5624     nzi += nlnk;
5625 
5626     /* make sure diagonal entry is included */
5627     if (diagonal_fill && lnk[i] == -1) {
5628       fm = n;
5629       while (lnk[fm] < i) fm = lnk[fm];
5630       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5631       lnk[fm]    = i;
5632       lnk_lvl[i] = 0;
5633       nzi++; dcount++;
5634     }
5635 
5636     /* add pivot rows into the active row */
5637     nzbd = 0;
5638     prow = lnk[n];
5639     while (prow < i) {
5640       nnz      = bdiag[prow];
5641       cols     = bj_ptr[prow] + nnz + 1;
5642       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5643       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5644       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5645       nzi += nlnk;
5646       prow = lnk[prow];
5647       nzbd++;
5648     }
5649     bdiag[i] = nzbd;
5650     bi[i+1]  = bi[i] + nzi;
5651 
5652     /* if free space is not available, make more free space */
5653     if (current_space->local_remaining<nzi) {
5654       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5655       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5656       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5657       reallocs++;
5658     }
5659 
5660     /* copy data into free_space and free_space_lvl, then initialize lnk */
5661     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5662     bj_ptr[i]    = current_space->array;
5663     bjlvl_ptr[i] = current_space_lvl->array;
5664 
5665     /* make sure the active row i has diagonal entry */
5666     if (*(bj_ptr[i]+bdiag[i]) != i) {
5667       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5668     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5669     }
5670 
5671     current_space->array           += nzi;
5672     current_space->local_used      += nzi;
5673     current_space->local_remaining -= nzi;
5674     current_space_lvl->array           += nzi;
5675     current_space_lvl->local_used      += nzi;
5676     current_space_lvl->local_remaining -= nzi;
5677   }
5678 
5679   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5680   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5681 
5682   /* destroy list of free space and other temporary arrays */
5683   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5684 
5685   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5686   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5687 
5688   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5689   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5690   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5691 
5692 #if defined(PETSC_USE_INFO)
5693   {
5694     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5695     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5696     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5697     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5698     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5699     if (diagonal_fill) {
5700       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5701     }
5702   }
5703 #endif
5704 
5705   /* put together the new matrix */
5706   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5707   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5708   b = (Mat_SeqBAIJ*)(fact)->data;
5709   b->free_a       = PETSC_TRUE;
5710   b->free_ij      = PETSC_TRUE;
5711   b->singlemalloc = PETSC_FALSE;
5712   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5713   b->j          = bj;
5714   b->i          = bi;
5715   b->diag       = bdiag;
5716   b->free_diag  = PETSC_TRUE;
5717   b->ilen       = 0;
5718   b->imax       = 0;
5719   b->row        = isrow;
5720   b->col        = iscol;
5721   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5722   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5723   b->icol       = isicol;
5724   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5725   /* In b structure:  Free imax, ilen, old a, old j.
5726      Allocate bdiag, solve_work, new a, new j */
5727   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5728   b->maxnz = b->nz = bdiag[0]+1;
5729   fact->info.factor_mallocs    = reallocs;
5730   fact->info.fill_ratio_given  = f;
5731   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5732   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5733   PetscFunctionReturn(0);
5734 }
5735 
5736 
5737 /*
5738      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5739    except that the data structure of Mat_SeqAIJ is slightly different.
5740    Not a good example of code reuse.
5741 */
5742 #undef __FUNCT__
5743 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
5744 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5745 {
5746   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5747   IS             isicol;
5748   PetscErrorCode ierr;
5749   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5750   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5751   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5752   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5753   PetscTruth     col_identity,row_identity,both_identity,flg;
5754   PetscReal      f;
5755 
5756   PetscFunctionBegin;
5757   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5758   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5759 
5760   f             = info->fill;
5761   levels        = (PetscInt)info->levels;
5762   diagonal_fill = (PetscInt)info->diagonal_fill;
5763   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5764 
5765   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5766   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5767   both_identity = (PetscTruth) (row_identity && col_identity);
5768 
5769   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5770     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5771     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5772 
5773     fact->factor = MAT_FACTOR_ILU;
5774     b            = (Mat_SeqBAIJ*)fact->data;
5775     b->row       = isrow;
5776     b->col       = iscol;
5777     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5778     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5779     b->icol      = isicol;
5780     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5781     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5782     PetscFunctionReturn(0);
5783   }
5784 
5785   /* general case perform the symbolic factorization */
5786     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5787     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5788 
5789     /* get new row pointers */
5790     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5791     ainew[0] = 0;
5792     /* don't know how many column pointers are needed so estimate */
5793     jmax = (PetscInt)(f*ai[n] + 1);
5794     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5795     /* ajfill is level of fill for each fill entry */
5796     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5797     /* fill is a linked list of nonzeros in active row */
5798     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5799     /* im is level for each filled value */
5800     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5801     /* dloc is location of diagonal in factor */
5802     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5803     dloc[0]  = 0;
5804     for (prow=0; prow<n; prow++) {
5805 
5806       /* copy prow into linked list */
5807       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5808       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5809       xi         = aj + ai[r[prow]];
5810       fill[n]    = n;
5811       fill[prow] = -1; /* marker for diagonal entry */
5812       while (nz--) {
5813 	fm  = n;
5814 	idx = ic[*xi++];
5815 	do {
5816 	  m  = fm;
5817 	  fm = fill[m];
5818 	} while (fm < idx);
5819 	fill[m]   = idx;
5820 	fill[idx] = fm;
5821 	im[idx]   = 0;
5822       }
5823 
5824       /* make sure diagonal entry is included */
5825       if (diagonal_fill && fill[prow] == -1) {
5826 	fm = n;
5827 	while (fill[fm] < prow) fm = fill[fm];
5828 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5829 	fill[fm]   = prow;
5830 	im[prow]   = 0;
5831 	nzf++;
5832 	dcount++;
5833       }
5834 
5835       nzi = 0;
5836       row = fill[n];
5837       while (row < prow) {
5838 	incrlev = im[row] + 1;
5839 	nz      = dloc[row];
5840 	xi      = ajnew  + ainew[row] + nz + 1;
5841 	flev    = ajfill + ainew[row] + nz + 1;
5842 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5843 	fm      = row;
5844 	while (nnz-- > 0) {
5845 	  idx = *xi++;
5846 	  if (*flev + incrlev > levels) {
5847 	    flev++;
5848 	    continue;
5849 	  }
5850 	  do {
5851 	    m  = fm;
5852 	    fm = fill[m];
5853 	  } while (fm < idx);
5854 	  if (fm != idx) {
5855 	    im[idx]   = *flev + incrlev;
5856 	    fill[m]   = idx;
5857 	    fill[idx] = fm;
5858 	    fm        = idx;
5859 	    nzf++;
5860 	  } else {
5861 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5862 	  }
5863 	  flev++;
5864 	}
5865 	row = fill[row];
5866 	nzi++;
5867       }
5868       /* copy new filled row into permanent storage */
5869       ainew[prow+1] = ainew[prow] + nzf;
5870       if (ainew[prow+1] > jmax) {
5871 
5872 	/* estimate how much additional space we will need */
5873 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5874 	/* just double the memory each time */
5875 	PetscInt maxadd = jmax;
5876 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5877 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5878 	jmax += maxadd;
5879 
5880 	/* allocate a longer ajnew and ajfill */
5881 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5882 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5883 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5884 	ajnew = xitmp;
5885 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5886 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5887 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5888 	ajfill = xitmp;
5889 	reallocate++; /* count how many reallocations are needed */
5890       }
5891       xitmp       = ajnew + ainew[prow];
5892       flev        = ajfill + ainew[prow];
5893       dloc[prow]  = nzi;
5894       fm          = fill[n];
5895       while (nzf--) {
5896 	*xitmp++ = fm;
5897 	*flev++ = im[fm];
5898 	fm      = fill[fm];
5899       }
5900       /* make sure row has diagonal entry */
5901       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5902 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5903     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5904       }
5905     }
5906     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5907     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5908     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5909     ierr = PetscFree(fill);CHKERRQ(ierr);
5910     ierr = PetscFree(im);CHKERRQ(ierr);
5911 
5912 #if defined(PETSC_USE_INFO)
5913     {
5914       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5915       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5916       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5917       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5918       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5919       if (diagonal_fill) {
5920 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5921       }
5922     }
5923 #endif
5924 
5925     /* put together the new matrix */
5926     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5927     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5928     b    = (Mat_SeqBAIJ*)fact->data;
5929     b->free_a       = PETSC_TRUE;
5930     b->free_ij      = PETSC_TRUE;
5931     b->singlemalloc = PETSC_FALSE;
5932     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5933     b->j          = ajnew;
5934     b->i          = ainew;
5935     for (i=0; i<n; i++) dloc[i] += ainew[i];
5936     b->diag       = dloc;
5937     b->free_diag  = PETSC_TRUE;
5938     b->ilen       = 0;
5939     b->imax       = 0;
5940     b->row        = isrow;
5941     b->col        = iscol;
5942     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5943     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5944     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5945     b->icol       = isicol;
5946     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5947     /* In b structure:  Free imax, ilen, old a, old j.
5948        Allocate dloc, solve_work, new a, new j */
5949     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5950     b->maxnz          = b->nz = ainew[n];
5951 
5952     fact->info.factor_mallocs    = reallocate;
5953     fact->info.fill_ratio_given  = f;
5954     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5955 
5956   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5957   PetscFunctionReturn(0);
5958 }
5959 
5960 #undef __FUNCT__
5961 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5962 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5963 {
5964   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5965   /* int i,*AJ=a->j,nz=a->nz; */
5966   PetscFunctionBegin;
5967   /* Undo Column scaling */
5968 /*    while (nz--) { */
5969 /*      AJ[i] = AJ[i]/4; */
5970 /*    } */
5971   /* This should really invoke a push/pop logic, but we don't have that yet. */
5972   A->ops->setunfactored = PETSC_NULL;
5973   PetscFunctionReturn(0);
5974 }
5975 
5976 #undef __FUNCT__
5977 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5978 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5979 {
5980   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5981   PetscInt       *AJ=a->j,nz=a->nz;
5982   unsigned short *aj=(unsigned short *)AJ;
5983   PetscFunctionBegin;
5984   /* Is this really necessary? */
5985   while (nz--) {
5986     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5987   }
5988   A->ops->setunfactored = PETSC_NULL;
5989   PetscFunctionReturn(0);
5990 }
5991 
5992 
5993