xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 766f9fba423f819039ab9d04da23ecbf6f0f0d3f)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "../src/mat/impls/baij/seq/baij.h"
8 #include "../src/mat/blockinvert.h"
9 #include "petscbt.h"
10 #include "../src/mat/utils/freespace.h"
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17   PetscErrorCode    ierr;
18   PetscInt          i,nz;
19   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
20   const MatScalar   *aa=a->a,*v;
21   PetscScalar       s1,*x;
22   const PetscScalar *b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt       nz,idx,idt,j,i,oidx;
125   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
126   MatScalar      *aa=a->a,*v;
127   PetscScalar    s1,s2,x1,x2;
128   PetscScalar    *x,*b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode ierr;
182   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183   PetscInt       *diag = a->diag,oidx;
184   MatScalar      *aa=a->a,*v;
185   PetscScalar    s1,s2,s3,x1,x2,x3;
186   PetscScalar    *x,*b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode ierr;
244   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245   PetscInt       nz,idx,idt,j,i,oidx;
246   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
247   MatScalar      *aa=a->a,*v;
248   PetscScalar    s1,s2,s3,x1,x2,x3;
249   PetscScalar    *x,*b;
250 
251   PetscFunctionBegin;
252   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255 
256   /* forward solve the U^T */
257   idx = 0;
258   for (i=0; i<n; i++) {
259     v     = aa + bs2*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265     v -= bs2;
266 
267     vi    = aj + diag[i] - 1;
268     nz    = diag[i] - diag[i+1] - 1;
269     for(j=0;j>-nz;j--){
270       oidx = bs*vi[j];
271       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274       v  -= bs2;
275     }
276     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277     idx += bs;
278   }
279   /* backward solve the L^T */
280   for (i=n-1; i>=0; i--){
281     v    = aa + bs2*ai[i];
282     vi   = aj + ai[i];
283     nz   = ai[i+1] - ai[i];
284     idt  = bs*i;
285     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286     for(j=0;j<nz;j++){
287       idx   = bs*vi[j];
288       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291       v += bs2;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode ierr;
306   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307   PetscInt       *diag = a->diag,oidx;
308   MatScalar      *aa=a->a,*v;
309   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
310   PetscScalar    *x,*b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 16*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328     v += 16;
329 
330     vi    = aj + diag[i] + 1;
331     nz    = ai[i+1] - diag[i] - 1;
332     while (nz--) {
333       oidx = 4*(*vi++);
334       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338       v  += 16;
339     }
340     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341     idx += 4;
342   }
343   /* backward solve the L^T */
344   for (i=n-1; i>=0; i--){
345     v    = aa + 16*diag[i] - 16;
346     vi   = aj + diag[i] - 1;
347     nz   = diag[i] - ai[i];
348     idt  = 4*i;
349     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350     while (nz--) {
351       idx   = 4*(*vi--);
352       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356       v -= 16;
357     }
358   }
359   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
370   PetscErrorCode ierr;
371   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372   PetscInt       nz,idx,idt,j,i,oidx;
373   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
374   MatScalar      *aa=a->a,*v;
375   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
376   PetscScalar    *x,*b;
377 
378   PetscFunctionBegin;
379   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382 
383   /* forward solve the U^T */
384   idx = 0;
385   for (i=0; i<n; i++) {
386     v     = aa + bs2*diag[i];
387     /* multiply by the inverse of the block diagonal */
388     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393     v -= bs2;
394 
395     vi    = aj + diag[i] - 1;
396     nz    = diag[i] - diag[i+1] - 1;
397     for(j=0;j>-nz;j--){
398       oidx = bs*vi[j];
399       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403       v  -= bs2;
404     }
405     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406     idx += bs;
407   }
408   /* backward solve the L^T */
409   for (i=n-1; i>=0; i--){
410     v    = aa + bs2*ai[i];
411     vi   = aj + ai[i];
412     nz   = ai[i+1] - ai[i];
413     idt  = bs*i;
414     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415     for(j=0;j<nz;j++){
416       idx   = bs*vi[j];
417       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421       v += bs2;
422     }
423   }
424   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
435   PetscErrorCode ierr;
436   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
437   PetscInt       *diag = a->diag,oidx;
438   MatScalar      *aa=a->a,*v;
439   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
440   PetscScalar    *x,*b;
441 
442   PetscFunctionBegin;
443   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446 
447   /* forward solve the U^T */
448   idx = 0;
449   for (i=0; i<n; i++) {
450 
451     v     = aa + 25*diag[i];
452     /* multiply by the inverse of the block diagonal */
453     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459     v += 25;
460 
461     vi    = aj + diag[i] + 1;
462     nz    = ai[i+1] - diag[i] - 1;
463     while (nz--) {
464       oidx = 5*(*vi++);
465       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470       v  += 25;
471     }
472     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473     idx += 5;
474   }
475   /* backward solve the L^T */
476   for (i=n-1; i>=0; i--){
477     v    = aa + 25*diag[i] - 25;
478     vi   = aj + diag[i] - 1;
479     nz   = diag[i] - ai[i];
480     idt  = 5*i;
481     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482     while (nz--) {
483       idx   = 5*(*vi--);
484       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489       v -= 25;
490     }
491   }
492   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
501 {
502   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503   PetscErrorCode ierr;
504   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505   PetscInt       nz,idx,idt,j,i,oidx;
506   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507   MatScalar      *aa=a->a,*v;
508   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
509   PetscScalar    *x,*b;
510 
511   PetscFunctionBegin;
512   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515 
516   /* forward solve the U^T */
517   idx = 0;
518   for (i=0; i<n; i++) {
519     v     = aa + bs2*diag[i];
520     /* multiply by the inverse of the block diagonal */
521     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522     x5 = x[4+idx];
523     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528     v -= bs2;
529 
530     vi    = aj + diag[i] - 1;
531     nz    = diag[i] - diag[i+1] - 1;
532     for(j=0;j>-nz;j--){
533       oidx = bs*vi[j];
534       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539       v  -= bs2;
540     }
541     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542     idx += bs;
543   }
544   /* backward solve the L^T */
545   for (i=n-1; i>=0; i--){
546     v    = aa + bs2*ai[i];
547     vi   = aj + ai[i];
548     nz   = ai[i+1] - ai[i];
549     idt  = bs*i;
550     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551     for(j=0;j<nz;j++){
552       idx   = bs*vi[j];
553       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558       v += bs2;
559     }
560   }
561   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
562   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570 {
571   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
572   PetscErrorCode ierr;
573   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
574   PetscInt       *diag = a->diag,oidx;
575   MatScalar      *aa=a->a,*v;
576   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
577   PetscScalar    *x,*b;
578 
579   PetscFunctionBegin;
580   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583 
584   /* forward solve the U^T */
585   idx = 0;
586   for (i=0; i<n; i++) {
587 
588     v     = aa + 36*diag[i];
589     /* multiply by the inverse of the block diagonal */
590     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591     x6    = x[5+idx];
592     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598     v += 36;
599 
600     vi    = aj + diag[i] + 1;
601     nz    = ai[i+1] - diag[i] - 1;
602     while (nz--) {
603       oidx = 6*(*vi++);
604       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610       v  += 36;
611     }
612     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613     x[5+idx] = s6;
614     idx += 6;
615   }
616   /* backward solve the L^T */
617   for (i=n-1; i>=0; i--){
618     v    = aa + 36*diag[i] - 36;
619     vi   = aj + diag[i] - 1;
620     nz   = diag[i] - ai[i];
621     idt  = 6*i;
622     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623     s6 = x[5+idt];
624     while (nz--) {
625       idx   = 6*(*vi--);
626       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v -= 36;
633     }
634   }
635   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #undef __FUNCT__
642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
644 {
645   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
646   PetscErrorCode ierr;
647   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648   PetscInt       nz,idx,idt,j,i,oidx;
649   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
650   MatScalar      *aa=a->a,*v;
651   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
652   PetscScalar    *x,*b;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658 
659   /* forward solve the U^T */
660   idx = 0;
661   for (i=0; i<n; i++) {
662     v     = aa + bs2*diag[i];
663     /* multiply by the inverse of the block diagonal */
664     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665     x5 = x[4+idx]; x6 = x[5+idx];
666     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672     v -= bs2;
673 
674     vi    = aj + diag[i] - 1;
675     nz    = diag[i] - diag[i+1] - 1;
676     for(j=0;j>-nz;j--){
677       oidx = bs*vi[j];
678       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v  -= bs2;
685     }
686     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687     x[5+idx] = s6;
688     idx += bs;
689   }
690   /* backward solve the L^T */
691   for (i=n-1; i>=0; i--){
692     v    = aa + bs2*ai[i];
693     vi   = aj + ai[i];
694     nz   = ai[i+1] - ai[i];
695     idt  = bs*i;
696     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697     s6   = x[5+idt];
698     for(j=0;j<nz;j++){
699       idx   = bs*vi[j];
700       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706       v += bs2;
707     }
708   }
709   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718 {
719   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
720   PetscErrorCode ierr;
721   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
722   PetscInt       *diag = a->diag,oidx;
723   MatScalar      *aa=a->a,*v;
724   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
725   PetscScalar    *x,*b;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 49*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739     x6    = x[5+idx]; x7 = x[6+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747     v += 49;
748 
749     vi    = aj + diag[i] + 1;
750     nz    = ai[i+1] - diag[i] - 1;
751     while (nz--) {
752       oidx = 7*(*vi++);
753       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760       v  += 49;
761     }
762     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763     x[5+idx] = s6;x[6+idx] = s7;
764     idx += 7;
765   }
766   /* backward solve the L^T */
767   for (i=n-1; i>=0; i--){
768     v    = aa + 49*diag[i] - 49;
769     vi   = aj + diag[i] - 1;
770     nz   = diag[i] - ai[i];
771     idt  = 7*i;
772     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773     s6 = x[5+idt];s7 = x[6+idt];
774     while (nz--) {
775       idx   = 7*(*vi--);
776       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783       v -= 49;
784     }
785   }
786   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796   PetscErrorCode ierr;
797   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798   PetscInt       nz,idx,idt,j,i,oidx;
799   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
800   MatScalar      *aa=a->a,*v;
801   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
802   PetscScalar    *x,*b;
803 
804   PetscFunctionBegin;
805   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808 
809   /* forward solve the U^T */
810   idx = 0;
811   for (i=0; i<n; i++) {
812     v     = aa + bs2*diag[i];
813     /* multiply by the inverse of the block diagonal */
814     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823     v -= bs2;
824     vi    = aj + diag[i] - 1;
825     nz    = diag[i] - diag[i+1] - 1;
826     for(j=0;j>-nz;j--){
827       oidx = bs*vi[j];
828       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v  -= bs2;
836     }
837     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838     x[5+idx] = s6;  x[6+idx] = s7;
839     idx += bs;
840   }
841   /* backward solve the L^T */
842   for (i=n-1; i>=0; i--){
843     v    = aa + bs2*ai[i];
844     vi   = aj + ai[i];
845     nz   = ai[i+1] - ai[i];
846     idt  = bs*i;
847     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848     s6   = x[5+idt];  s7 = x[6+idt];
849     for(j=0;j<nz;j++){
850       idx   = bs*vi[j];
851       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858       v += bs2;
859     }
860   }
861   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*---------------------------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
873   IS             iscol=a->col,isrow=a->row;
874   PetscErrorCode ierr;
875   const PetscInt *r,*c,*rout,*cout;
876   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
877   PetscInt       *diag = a->diag;
878   MatScalar      *aa=a->a,*v;
879   PetscScalar    s1,*x,*b,*t;
880 
881   PetscFunctionBegin;
882   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
883   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
884   t  = a->solve_work;
885 
886   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
887   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
888 
889   /* copy the b into temp work space according to permutation */
890   for (i=0; i<n; i++) {
891     t[i] = b[c[i]];
892   }
893 
894   /* forward solve the U^T */
895   for (i=0; i<n; i++) {
896 
897     v     = aa + diag[i];
898     /* multiply by the inverse of the block diagonal */
899     s1    = (*v++)*t[i];
900     vi    = aj + diag[i] + 1;
901     nz    = ai[i+1] - diag[i] - 1;
902     while (nz--) {
903       t[*vi++]  -= (*v++)*s1;
904     }
905     t[i]   = s1;
906   }
907   /* backward solve the L^T */
908   for (i=n-1; i>=0; i--){
909     v    = aa + diag[i] - 1;
910     vi   = aj + diag[i] - 1;
911     nz   = diag[i] - ai[i];
912     s1   = t[i];
913     while (nz--) {
914       t[*vi--]   -=  (*v--)*s1;
915     }
916   }
917 
918   /* copy t into x according to permutation */
919   for (i=0; i<n; i++) {
920     x[r[i]]   = t[i];
921   }
922 
923   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
924   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
925   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
926   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
927   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 
931 #undef __FUNCT__
932 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
933 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
934 {
935   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
936   IS             iscol=a->col,isrow=a->row;
937   PetscErrorCode ierr;
938   const PetscInt *r,*c,*rout,*cout;
939   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
940   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
941   MatScalar      *aa=a->a,*v;
942   PetscScalar    s1,s2,x1,x2;
943   PetscScalar    *x,*b,*t;
944 
945   PetscFunctionBegin;
946   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
947   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
948   t  = a->solve_work;
949 
950   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
951   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
952 
953   /* copy the b into temp work space according to permutation */
954   ii = 0;
955   for (i=0; i<n; i++) {
956     ic      = 2*c[i];
957     t[ii]   = b[ic];
958     t[ii+1] = b[ic+1];
959     ii += 2;
960   }
961 
962   /* forward solve the U^T */
963   idx = 0;
964   for (i=0; i<n; i++) {
965 
966     v     = aa + 4*diag[i];
967     /* multiply by the inverse of the block diagonal */
968     x1    = t[idx];   x2 = t[1+idx];
969     s1 = v[0]*x1  +  v[1]*x2;
970     s2 = v[2]*x1  +  v[3]*x2;
971     v += 4;
972 
973     vi    = aj + diag[i] + 1;
974     nz    = ai[i+1] - diag[i] - 1;
975     while (nz--) {
976       oidx = 2*(*vi++);
977       t[oidx]   -= v[0]*s1  +  v[1]*s2;
978       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
979       v  += 4;
980     }
981     t[idx]   = s1;t[1+idx] = s2;
982     idx += 2;
983   }
984   /* backward solve the L^T */
985   for (i=n-1; i>=0; i--){
986     v    = aa + 4*diag[i] - 4;
987     vi   = aj + diag[i] - 1;
988     nz   = diag[i] - ai[i];
989     idt  = 2*i;
990     s1 = t[idt];  s2 = t[1+idt];
991     while (nz--) {
992       idx   = 2*(*vi--);
993       t[idx]   -=  v[0]*s1 +  v[1]*s2;
994       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
995       v -= 4;
996     }
997   }
998 
999   /* copy t into x according to permutation */
1000   ii = 0;
1001   for (i=0; i<n; i++) {
1002     ir      = 2*r[i];
1003     x[ir]   = t[ii];
1004     x[ir+1] = t[ii+1];
1005     ii += 2;
1006   }
1007 
1008   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1009   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1010   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1011   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1012   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1013   PetscFunctionReturn(0);
1014 }
1015 
1016 #undef __FUNCT__
1017 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1018 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1019 {
1020   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1021   PetscErrorCode ierr;
1022   IS             iscol=a->col,isrow=a->row;
1023   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1024   const PetscInt *r,*c,*rout,*cout;
1025   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1026   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1027   MatScalar      *aa=a->a,*v;
1028   PetscScalar    s1,s2,x1,x2;
1029   PetscScalar    *x,*b,*t;
1030 
1031   PetscFunctionBegin;
1032   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1033   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1034   t = a->solve_work;
1035 
1036   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1037   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1038 
1039   /* copy b into temp work space according to permutation */
1040   for(i=0;i<n;i++){
1041     ii = bs*i; ic = bs*c[i];
1042     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1043   }
1044 
1045   /* forward solve the U^T */
1046   idx = 0;
1047   for (i=0; i<n; i++) {
1048     v     = aa + bs2*diag[i];
1049     /* multiply by the inverse of the block diagonal */
1050     x1 = t[idx];   x2 = t[1+idx];
1051     s1 = v[0]*x1  +  v[1]*x2;
1052     s2 = v[2]*x1  +  v[3]*x2;
1053     v -= bs2;
1054 
1055     vi    = aj + diag[i] - 1;
1056     nz    = diag[i] - diag[i+1] - 1;
1057     for(j=0;j>-nz;j--){
1058       oidx = bs*vi[j];
1059       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1060       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1061       v  -= bs2;
1062     }
1063     t[idx]   = s1;t[1+idx] = s2;
1064     idx += bs;
1065   }
1066   /* backward solve the L^T */
1067   for (i=n-1; i>=0; i--){
1068     v    = aa + bs2*ai[i];
1069     vi   = aj + ai[i];
1070     nz   = ai[i+1] - ai[i];
1071     idt  = bs*i;
1072     s1   = t[idt];  s2 = t[1+idt];
1073     for(j=0;j<nz;j++){
1074       idx   = bs*vi[j];
1075       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1076       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1077       v += bs2;
1078     }
1079   }
1080 
1081   /* copy t into x according to permutation */
1082   for(i=0;i<n;i++){
1083     ii = bs*i;  ir = bs*r[i];
1084     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1085   }
1086 
1087   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 #undef __FUNCT__
1096 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1097 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1098 {
1099   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1100   IS             iscol=a->col,isrow=a->row;
1101   PetscErrorCode ierr;
1102   const PetscInt *r,*c,*rout,*cout;
1103   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1104   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1105   MatScalar      *aa=a->a,*v;
1106   PetscScalar    s1,s2,s3,x1,x2,x3;
1107   PetscScalar    *x,*b,*t;
1108 
1109   PetscFunctionBegin;
1110   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1111   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1112   t  = a->solve_work;
1113 
1114   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1115   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1116 
1117   /* copy the b into temp work space according to permutation */
1118   ii = 0;
1119   for (i=0; i<n; i++) {
1120     ic      = 3*c[i];
1121     t[ii]   = b[ic];
1122     t[ii+1] = b[ic+1];
1123     t[ii+2] = b[ic+2];
1124     ii += 3;
1125   }
1126 
1127   /* forward solve the U^T */
1128   idx = 0;
1129   for (i=0; i<n; i++) {
1130 
1131     v     = aa + 9*diag[i];
1132     /* multiply by the inverse of the block diagonal */
1133     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1134     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1135     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1136     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1137     v += 9;
1138 
1139     vi    = aj + diag[i] + 1;
1140     nz    = ai[i+1] - diag[i] - 1;
1141     while (nz--) {
1142       oidx = 3*(*vi++);
1143       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1144       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1145       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1146       v  += 9;
1147     }
1148     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1149     idx += 3;
1150   }
1151   /* backward solve the L^T */
1152   for (i=n-1; i>=0; i--){
1153     v    = aa + 9*diag[i] - 9;
1154     vi   = aj + diag[i] - 1;
1155     nz   = diag[i] - ai[i];
1156     idt  = 3*i;
1157     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1158     while (nz--) {
1159       idx   = 3*(*vi--);
1160       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1161       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1162       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1163       v -= 9;
1164     }
1165   }
1166 
1167   /* copy t into x according to permutation */
1168   ii = 0;
1169   for (i=0; i<n; i++) {
1170     ir      = 3*r[i];
1171     x[ir]   = t[ii];
1172     x[ir+1] = t[ii+1];
1173     x[ir+2] = t[ii+2];
1174     ii += 3;
1175   }
1176 
1177   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1178   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1179   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1180   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1181   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 #undef __FUNCT__
1186 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1187 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1188 {
1189   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1190   PetscErrorCode ierr;
1191   IS             iscol=a->col,isrow=a->row;
1192   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1193   const PetscInt *r,*c,*rout,*cout;
1194   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1195   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1196   MatScalar      *aa=a->a,*v;
1197   PetscScalar    s1,s2,s3,x1,x2,x3;
1198   PetscScalar    *x,*b,*t;
1199 
1200   PetscFunctionBegin;
1201   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1202   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1203   t = a->solve_work;
1204 
1205   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1206   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1207 
1208   /* copy b into temp work space according to permutation */
1209   for(i=0;i<n;i++){
1210     ii = bs*i; ic = bs*c[i];
1211     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1212   }
1213 
1214   /* forward solve the U^T */
1215   idx = 0;
1216   for (i=0; i<n; i++) {
1217     v     = aa + bs2*diag[i];
1218     /* multiply by the inverse of the block diagonal */
1219     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1220     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1221     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1222     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1223     v -= bs2;
1224 
1225     vi    = aj + diag[i] - 1;
1226     nz    = diag[i] - diag[i+1] - 1;
1227     for(j=0;j>-nz;j--){
1228       oidx = bs*vi[j];
1229       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1230       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1231       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1232       v  -= bs2;
1233     }
1234     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1235     idx += bs;
1236   }
1237   /* backward solve the L^T */
1238   for (i=n-1; i>=0; i--){
1239     v    = aa + bs2*ai[i];
1240     vi   = aj + ai[i];
1241     nz   = ai[i+1] - ai[i];
1242     idt  = bs*i;
1243     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1244     for(j=0;j<nz;j++){
1245       idx   = bs*vi[j];
1246       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1247       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1248       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1249       v += bs2;
1250     }
1251   }
1252 
1253   /* copy t into x according to permutation */
1254   for(i=0;i<n;i++){
1255     ii = bs*i;  ir = bs*r[i];
1256     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1257   }
1258 
1259   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1260   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1261   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1263   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 #undef __FUNCT__
1268 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1269 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1270 {
1271   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1272   IS             iscol=a->col,isrow=a->row;
1273   PetscErrorCode ierr;
1274   const PetscInt *r,*c,*rout,*cout;
1275   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1276   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1277   MatScalar      *aa=a->a,*v;
1278   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1279   PetscScalar    *x,*b,*t;
1280 
1281   PetscFunctionBegin;
1282   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1283   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1284   t  = a->solve_work;
1285 
1286   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1287   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1288 
1289   /* copy the b into temp work space according to permutation */
1290   ii = 0;
1291   for (i=0; i<n; i++) {
1292     ic      = 4*c[i];
1293     t[ii]   = b[ic];
1294     t[ii+1] = b[ic+1];
1295     t[ii+2] = b[ic+2];
1296     t[ii+3] = b[ic+3];
1297     ii += 4;
1298   }
1299 
1300   /* forward solve the U^T */
1301   idx = 0;
1302   for (i=0; i<n; i++) {
1303 
1304     v     = aa + 16*diag[i];
1305     /* multiply by the inverse of the block diagonal */
1306     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1307     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1308     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1309     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1310     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1311     v += 16;
1312 
1313     vi    = aj + diag[i] + 1;
1314     nz    = ai[i+1] - diag[i] - 1;
1315     while (nz--) {
1316       oidx = 4*(*vi++);
1317       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1318       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1319       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1320       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1321       v  += 16;
1322     }
1323     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1324     idx += 4;
1325   }
1326   /* backward solve the L^T */
1327   for (i=n-1; i>=0; i--){
1328     v    = aa + 16*diag[i] - 16;
1329     vi   = aj + diag[i] - 1;
1330     nz   = diag[i] - ai[i];
1331     idt  = 4*i;
1332     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1333     while (nz--) {
1334       idx   = 4*(*vi--);
1335       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1336       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1337       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1338       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1339       v -= 16;
1340     }
1341   }
1342 
1343   /* copy t into x according to permutation */
1344   ii = 0;
1345   for (i=0; i<n; i++) {
1346     ir      = 4*r[i];
1347     x[ir]   = t[ii];
1348     x[ir+1] = t[ii+1];
1349     x[ir+2] = t[ii+2];
1350     x[ir+3] = t[ii+3];
1351     ii += 4;
1352   }
1353 
1354   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1355   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1356   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1358   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 #undef __FUNCT__
1363 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1364 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1365 {
1366   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1367   PetscErrorCode ierr;
1368   IS             iscol=a->col,isrow=a->row;
1369   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1370   const PetscInt *r,*c,*rout,*cout;
1371   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1372   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1373   MatScalar      *aa=a->a,*v;
1374   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1375   PetscScalar    *x,*b,*t;
1376 
1377   PetscFunctionBegin;
1378   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1379   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1380   t = a->solve_work;
1381 
1382   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1383   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1384 
1385   /* copy b into temp work space according to permutation */
1386   for(i=0;i<n;i++){
1387     ii = bs*i; ic = bs*c[i];
1388     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1389   }
1390 
1391   /* forward solve the U^T */
1392   idx = 0;
1393   for (i=0; i<n; i++) {
1394     v     = aa + bs2*diag[i];
1395     /* multiply by the inverse of the block diagonal */
1396     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1397     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1398     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1399     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1400     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1401     v -= bs2;
1402 
1403     vi    = aj + diag[i] - 1;
1404     nz    = diag[i] - diag[i+1] - 1;
1405     for(j=0;j>-nz;j--){
1406       oidx = bs*vi[j];
1407       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1408       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1409       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1410       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1411       v  -= bs2;
1412     }
1413     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1414     idx += bs;
1415   }
1416   /* backward solve the L^T */
1417   for (i=n-1; i>=0; i--){
1418     v    = aa + bs2*ai[i];
1419     vi   = aj + ai[i];
1420     nz   = ai[i+1] - ai[i];
1421     idt  = bs*i;
1422     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1423     for(j=0;j<nz;j++){
1424       idx   = bs*vi[j];
1425       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1426       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1427       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1428       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1429       v += bs2;
1430     }
1431   }
1432 
1433   /* copy t into x according to permutation */
1434   for(i=0;i<n;i++){
1435     ii = bs*i;  ir = bs*r[i];
1436     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1437   }
1438 
1439   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1440   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1443   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 #undef __FUNCT__
1448 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1449 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1450 {
1451   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1452   IS             iscol=a->col,isrow=a->row;
1453   PetscErrorCode ierr;
1454   const PetscInt *r,*c,*rout,*cout;
1455   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1456   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1457   MatScalar      *aa=a->a,*v;
1458   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1459   PetscScalar    *x,*b,*t;
1460 
1461   PetscFunctionBegin;
1462   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1463   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1464   t  = a->solve_work;
1465 
1466   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1467   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1468 
1469   /* copy the b into temp work space according to permutation */
1470   ii = 0;
1471   for (i=0; i<n; i++) {
1472     ic      = 5*c[i];
1473     t[ii]   = b[ic];
1474     t[ii+1] = b[ic+1];
1475     t[ii+2] = b[ic+2];
1476     t[ii+3] = b[ic+3];
1477     t[ii+4] = b[ic+4];
1478     ii += 5;
1479   }
1480 
1481   /* forward solve the U^T */
1482   idx = 0;
1483   for (i=0; i<n; i++) {
1484 
1485     v     = aa + 25*diag[i];
1486     /* multiply by the inverse of the block diagonal */
1487     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1488     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1489     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1490     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1491     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1492     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1493     v += 25;
1494 
1495     vi    = aj + diag[i] + 1;
1496     nz    = ai[i+1] - diag[i] - 1;
1497     while (nz--) {
1498       oidx = 5*(*vi++);
1499       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1500       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1501       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1502       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1503       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1504       v  += 25;
1505     }
1506     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1507     idx += 5;
1508   }
1509   /* backward solve the L^T */
1510   for (i=n-1; i>=0; i--){
1511     v    = aa + 25*diag[i] - 25;
1512     vi   = aj + diag[i] - 1;
1513     nz   = diag[i] - ai[i];
1514     idt  = 5*i;
1515     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1516     while (nz--) {
1517       idx   = 5*(*vi--);
1518       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1519       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1520       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1521       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1522       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1523       v -= 25;
1524     }
1525   }
1526 
1527   /* copy t into x according to permutation */
1528   ii = 0;
1529   for (i=0; i<n; i++) {
1530     ir      = 5*r[i];
1531     x[ir]   = t[ii];
1532     x[ir+1] = t[ii+1];
1533     x[ir+2] = t[ii+2];
1534     x[ir+3] = t[ii+3];
1535     x[ir+4] = t[ii+4];
1536     ii += 5;
1537   }
1538 
1539   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1540   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1541   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1542   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1543   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1544   PetscFunctionReturn(0);
1545 }
1546 
1547 #undef __FUNCT__
1548 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1549 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1550 {
1551   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1552   PetscErrorCode ierr;
1553   IS             iscol=a->col,isrow=a->row;
1554   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1555   const PetscInt *r,*c,*rout,*cout;
1556   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1557   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1558   MatScalar      *aa=a->a,*v;
1559   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1560   PetscScalar    *x,*b,*t;
1561 
1562   PetscFunctionBegin;
1563   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1564   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1565   t = a->solve_work;
1566 
1567   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1568   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1569 
1570   /* copy b into temp work space according to permutation */
1571   for(i=0;i<n;i++){
1572     ii = bs*i; ic = bs*c[i];
1573     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1574     t[ii+4] = b[ic+4];
1575   }
1576 
1577   /* forward solve the U^T */
1578   idx = 0;
1579   for (i=0; i<n; i++) {
1580     v     = aa + bs2*diag[i];
1581     /* multiply by the inverse of the block diagonal */
1582     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1583     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1584     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1585     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1586     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1587     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1588     v -= bs2;
1589 
1590     vi    = aj + diag[i] - 1;
1591     nz    = diag[i] - diag[i+1] - 1;
1592     for(j=0;j>-nz;j--){
1593       oidx = bs*vi[j];
1594       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1595       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1596       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1597       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1598       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1599       v  -= bs2;
1600     }
1601     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1602     idx += bs;
1603   }
1604   /* backward solve the L^T */
1605   for (i=n-1; i>=0; i--){
1606     v    = aa + bs2*ai[i];
1607     vi   = aj + ai[i];
1608     nz   = ai[i+1] - ai[i];
1609     idt  = bs*i;
1610     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1611     for(j=0;j<nz;j++){
1612       idx   = bs*vi[j];
1613       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1614       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1615       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1616       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1617       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1618       v += bs2;
1619     }
1620   }
1621 
1622   /* copy t into x according to permutation */
1623   for(i=0;i<n;i++){
1624     ii = bs*i;  ir = bs*r[i];
1625     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1626     x[ir+4] = t[ii+4];
1627   }
1628 
1629   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1630   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1631   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 #undef __FUNCT__
1638 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1639 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1640 {
1641   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1642   IS             iscol=a->col,isrow=a->row;
1643   PetscErrorCode ierr;
1644   const PetscInt *r,*c,*rout,*cout;
1645   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1646   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1647   MatScalar      *aa=a->a,*v;
1648   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1649   PetscScalar    *x,*b,*t;
1650 
1651   PetscFunctionBegin;
1652   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1653   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1654   t  = a->solve_work;
1655 
1656   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1657   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1658 
1659   /* copy the b into temp work space according to permutation */
1660   ii = 0;
1661   for (i=0; i<n; i++) {
1662     ic      = 6*c[i];
1663     t[ii]   = b[ic];
1664     t[ii+1] = b[ic+1];
1665     t[ii+2] = b[ic+2];
1666     t[ii+3] = b[ic+3];
1667     t[ii+4] = b[ic+4];
1668     t[ii+5] = b[ic+5];
1669     ii += 6;
1670   }
1671 
1672   /* forward solve the U^T */
1673   idx = 0;
1674   for (i=0; i<n; i++) {
1675 
1676     v     = aa + 36*diag[i];
1677     /* multiply by the inverse of the block diagonal */
1678     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1679     x6    = t[5+idx];
1680     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1681     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1682     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1683     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1684     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1685     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1686     v += 36;
1687 
1688     vi    = aj + diag[i] + 1;
1689     nz    = ai[i+1] - diag[i] - 1;
1690     while (nz--) {
1691       oidx = 6*(*vi++);
1692       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1693       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1694       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1695       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1696       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1697       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1698       v  += 36;
1699     }
1700     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1701     t[5+idx] = s6;
1702     idx += 6;
1703   }
1704   /* backward solve the L^T */
1705   for (i=n-1; i>=0; i--){
1706     v    = aa + 36*diag[i] - 36;
1707     vi   = aj + diag[i] - 1;
1708     nz   = diag[i] - ai[i];
1709     idt  = 6*i;
1710     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1711     s6 = t[5+idt];
1712     while (nz--) {
1713       idx   = 6*(*vi--);
1714       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1715       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1716       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1717       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1718       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1719       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1720       v -= 36;
1721     }
1722   }
1723 
1724   /* copy t into x according to permutation */
1725   ii = 0;
1726   for (i=0; i<n; i++) {
1727     ir      = 6*r[i];
1728     x[ir]   = t[ii];
1729     x[ir+1] = t[ii+1];
1730     x[ir+2] = t[ii+2];
1731     x[ir+3] = t[ii+3];
1732     x[ir+4] = t[ii+4];
1733     x[ir+5] = t[ii+5];
1734     ii += 6;
1735   }
1736 
1737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 #undef __FUNCT__
1746 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1747 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1748 {
1749   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1750   PetscErrorCode ierr;
1751   IS             iscol=a->col,isrow=a->row;
1752   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1753   const PetscInt *r,*c,*rout,*cout;
1754   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1755   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1756   MatScalar      *aa=a->a,*v;
1757   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1758   PetscScalar    *x,*b,*t;
1759 
1760   PetscFunctionBegin;
1761   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763   t = a->solve_work;
1764 
1765   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1766   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1767 
1768   /* copy b into temp work space according to permutation */
1769   for(i=0;i<n;i++){
1770     ii = bs*i; ic = bs*c[i];
1771     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1772     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1773   }
1774 
1775   /* forward solve the U^T */
1776   idx = 0;
1777   for (i=0; i<n; i++) {
1778     v     = aa + bs2*diag[i];
1779     /* multiply by the inverse of the block diagonal */
1780     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1781     x6    = t[5+idx];
1782     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1783     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1784     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1785     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1786     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1787     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1788     v -= bs2;
1789 
1790     vi    = aj + diag[i] - 1;
1791     nz    = diag[i] - diag[i+1] - 1;
1792     for(j=0;j>-nz;j--){
1793       oidx = bs*vi[j];
1794       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1795       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1796       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1797       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1798       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1799       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1800       v  -= bs2;
1801     }
1802     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1803     t[5+idx] = s6;
1804     idx += bs;
1805   }
1806   /* backward solve the L^T */
1807   for (i=n-1; i>=0; i--){
1808     v    = aa + bs2*ai[i];
1809     vi   = aj + ai[i];
1810     nz   = ai[i+1] - ai[i];
1811     idt  = bs*i;
1812     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1813     s6   = t[5+idt];
1814    for(j=0;j<nz;j++){
1815       idx   = bs*vi[j];
1816       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1817       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1818       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1819       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1820       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1821       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1822       v += bs2;
1823     }
1824   }
1825 
1826   /* copy t into x according to permutation */
1827   for(i=0;i<n;i++){
1828     ii = bs*i;  ir = bs*r[i];
1829     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1830     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1831   }
1832 
1833   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1834   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1835   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1836   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1837   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1838   PetscFunctionReturn(0);
1839 }
1840 
1841 #undef __FUNCT__
1842 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1843 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1844 {
1845   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1846   IS             iscol=a->col,isrow=a->row;
1847   PetscErrorCode ierr;
1848   const PetscInt *r,*c,*rout,*cout;
1849   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1850   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1851   MatScalar      *aa=a->a,*v;
1852   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1853   PetscScalar    *x,*b,*t;
1854 
1855   PetscFunctionBegin;
1856   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1857   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1858   t  = a->solve_work;
1859 
1860   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1861   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1862 
1863   /* copy the b into temp work space according to permutation */
1864   ii = 0;
1865   for (i=0; i<n; i++) {
1866     ic      = 7*c[i];
1867     t[ii]   = b[ic];
1868     t[ii+1] = b[ic+1];
1869     t[ii+2] = b[ic+2];
1870     t[ii+3] = b[ic+3];
1871     t[ii+4] = b[ic+4];
1872     t[ii+5] = b[ic+5];
1873     t[ii+6] = b[ic+6];
1874     ii += 7;
1875   }
1876 
1877   /* forward solve the U^T */
1878   idx = 0;
1879   for (i=0; i<n; i++) {
1880 
1881     v     = aa + 49*diag[i];
1882     /* multiply by the inverse of the block diagonal */
1883     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1884     x6    = t[5+idx]; x7 = t[6+idx];
1885     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1886     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1887     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1888     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1889     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1890     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1891     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1892     v += 49;
1893 
1894     vi    = aj + diag[i] + 1;
1895     nz    = ai[i+1] - diag[i] - 1;
1896     while (nz--) {
1897       oidx = 7*(*vi++);
1898       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1899       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1900       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1901       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1902       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1903       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1904       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1905       v  += 49;
1906     }
1907     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1908     t[5+idx] = s6;t[6+idx] = s7;
1909     idx += 7;
1910   }
1911   /* backward solve the L^T */
1912   for (i=n-1; i>=0; i--){
1913     v    = aa + 49*diag[i] - 49;
1914     vi   = aj + diag[i] - 1;
1915     nz   = diag[i] - ai[i];
1916     idt  = 7*i;
1917     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1918     s6 = t[5+idt];s7 = t[6+idt];
1919     while (nz--) {
1920       idx   = 7*(*vi--);
1921       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1922       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1923       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1924       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1925       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1926       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1927       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1928       v -= 49;
1929     }
1930   }
1931 
1932   /* copy t into x according to permutation */
1933   ii = 0;
1934   for (i=0; i<n; i++) {
1935     ir      = 7*r[i];
1936     x[ir]   = t[ii];
1937     x[ir+1] = t[ii+1];
1938     x[ir+2] = t[ii+2];
1939     x[ir+3] = t[ii+3];
1940     x[ir+4] = t[ii+4];
1941     x[ir+5] = t[ii+5];
1942     x[ir+6] = t[ii+6];
1943     ii += 7;
1944   }
1945 
1946   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1947   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1948   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1950   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1951   PetscFunctionReturn(0);
1952 }
1953 #undef __FUNCT__
1954 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1955 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1956 {
1957   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1958   PetscErrorCode ierr;
1959   IS             iscol=a->col,isrow=a->row;
1960   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1961   const PetscInt *r,*c,*rout,*cout;
1962   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1963   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1964   MatScalar      *aa=a->a,*v;
1965   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1966   PetscScalar    *x,*b,*t;
1967 
1968   PetscFunctionBegin;
1969   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1970   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1971   t = a->solve_work;
1972 
1973   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1974   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1975 
1976   /* copy b into temp work space according to permutation */
1977   for(i=0;i<n;i++){
1978     ii = bs*i; ic = bs*c[i];
1979     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1980     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1981   }
1982 
1983   /* forward solve the U^T */
1984   idx = 0;
1985   for (i=0; i<n; i++) {
1986     v     = aa + bs2*diag[i];
1987     /* multiply by the inverse of the block diagonal */
1988     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1989     x6    = t[5+idx]; x7 = t[6+idx];
1990     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1991     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1992     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1993     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1994     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1995     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1996     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1997     v -= bs2;
1998 
1999     vi    = aj + diag[i] - 1;
2000     nz    = diag[i] - diag[i+1] - 1;
2001     for(j=0;j>-nz;j--){
2002       oidx = bs*vi[j];
2003       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2004       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2005       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2006       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2007       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2008       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2009       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2010       v  -= bs2;
2011     }
2012     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2013     t[5+idx] = s6;  t[6+idx] = s7;
2014     idx += bs;
2015   }
2016   /* backward solve the L^T */
2017   for (i=n-1; i>=0; i--){
2018     v    = aa + bs2*ai[i];
2019     vi   = aj + ai[i];
2020     nz   = ai[i+1] - ai[i];
2021     idt  = bs*i;
2022     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2023     s6   = t[5+idt];  s7 = t[6+idt];
2024    for(j=0;j<nz;j++){
2025       idx   = bs*vi[j];
2026       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2027       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2028       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2029       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2030       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2031       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2032       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2033       v += bs2;
2034     }
2035   }
2036 
2037   /* copy t into x according to permutation */
2038   for(i=0;i<n;i++){
2039     ii = bs*i;  ir = bs*r[i];
2040     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2041     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2042   }
2043 
2044   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2045   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2046   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2047   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2048   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2049   PetscFunctionReturn(0);
2050 }
2051 
2052 /* ----------------------------------------------------------- */
2053 #undef __FUNCT__
2054 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2055 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2056 {
2057   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2058   IS             iscol=a->col,isrow=a->row;
2059   PetscErrorCode ierr;
2060   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2061   PetscInt       i,n=a->mbs;
2062   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
2063   MatScalar      *aa=a->a,*v;
2064   PetscScalar    *x,*b,*s,*t,*ls;
2065 
2066   PetscFunctionBegin;
2067   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2068   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2069   t  = a->solve_work;
2070 
2071   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2072   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2073 
2074   /* forward solve the lower triangular */
2075   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2076   for (i=1; i<n; i++) {
2077     v   = aa + bs2*ai[i];
2078     vi  = aj + ai[i];
2079     nz  = a->diag[i] - ai[i];
2080     s = t + bs*i;
2081     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2082     while (nz--) {
2083       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2084       v += bs2;
2085     }
2086   }
2087   /* backward solve the upper triangular */
2088   ls = a->solve_work + A->cmap->n;
2089   for (i=n-1; i>=0; i--){
2090     v   = aa + bs2*(a->diag[i] + 1);
2091     vi  = aj + a->diag[i] + 1;
2092     nz  = ai[i+1] - a->diag[i] - 1;
2093     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2094     while (nz--) {
2095       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2096       v += bs2;
2097     }
2098     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2099     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2100   }
2101 
2102   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2103   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2104   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2105   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2106   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2107   PetscFunctionReturn(0);
2108 }
2109 
2110 /* ----------------------------------------------------------- */
2111 #undef __FUNCT__
2112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2113 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2114 {
2115   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2116   IS                iscol=a->col,isrow=a->row;
2117   PetscErrorCode    ierr;
2118   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2119   PetscInt          i,n=a->mbs,j;
2120   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
2121   const MatScalar   *aa=a->a,*v;
2122   PetscScalar       *x,*t,*ls;
2123   const PetscScalar *b;
2124   PetscFunctionBegin;
2125   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2126   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2127   t    = a->solve_work;
2128 
2129   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2130   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2131 
2132   /* copy the b into temp work space according to permutation */
2133   for (i=0; i<n; i++) {
2134     for (j=0; j<bs; j++) {
2135       t[i*bs+j] = b[c[i]*bs+j];
2136     }
2137   }
2138 
2139 
2140   /* forward solve the upper triangular transpose */
2141   ls = a->solve_work + A->cmap->n;
2142   for (i=0; i<n; i++){
2143     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2144     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2145     v   = aa + bs2*(a->diag[i] + 1);
2146     vi  = aj + a->diag[i] + 1;
2147     nz  = ai[i+1] - a->diag[i] - 1;
2148     while (nz--) {
2149       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2150       v += bs2;
2151     }
2152   }
2153 
2154   /* backward solve the lower triangular transpose */
2155   for (i=n-1; i>=0; i--) {
2156     v   = aa + bs2*ai[i];
2157     vi  = aj + ai[i];
2158     nz  = a->diag[i] - ai[i];
2159     while (nz--) {
2160       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2161       v += bs2;
2162     }
2163   }
2164 
2165   /* copy t into x according to permutation */
2166   for (i=0; i<n; i++) {
2167     for (j=0; j<bs; j++) {
2168       x[bs*r[i]+j]   = t[bs*i+j];
2169     }
2170   }
2171 
2172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2174   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2176   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2177   PetscFunctionReturn(0);
2178 }
2179 
2180 #undef __FUNCT__
2181 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2182 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2183 {
2184   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2185   IS                iscol=a->col,isrow=a->row;
2186   PetscErrorCode    ierr;
2187   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2188   PetscInt          i,n=a->mbs,j;
2189   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
2190   const MatScalar   *aa=a->a,*v;
2191   PetscScalar       *x,*t,*ls;
2192   const PetscScalar *b;
2193   PetscFunctionBegin;
2194   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2195   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2196   t    = a->solve_work;
2197 
2198   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2199   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2200 
2201   /* copy the b into temp work space according to permutation */
2202   for (i=0; i<n; i++) {
2203     for (j=0; j<bs; j++) {
2204       t[i*bs+j] = b[c[i]*bs+j];
2205     }
2206   }
2207 
2208 
2209   /* forward solve the upper triangular transpose */
2210   ls = a->solve_work + A->cmap->n;
2211   for (i=0; i<n; i++){
2212     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2213     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2214     v   = aa + bs2*(diag[i] - 1);
2215     vi  = aj + diag[i] - 1;
2216     nz  = diag[i] - diag[i+1] - 1;
2217     for(j=0;j>-nz;j--){
2218       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2219       v -= bs2;
2220     }
2221   }
2222 
2223   /* backward solve the lower triangular transpose */
2224   for (i=n-1; i>=0; i--) {
2225     v   = aa + bs2*ai[i];
2226     vi  = aj + ai[i];
2227     nz  = ai[i+1] - ai[i];
2228     for(j=0;j<nz;j++){
2229       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2230       v += bs2;
2231     }
2232   }
2233 
2234   /* copy t into x according to permutation */
2235   for (i=0; i<n; i++) {
2236     for (j=0; j<bs; j++) {
2237       x[bs*r[i]+j]   = t[bs*i+j];
2238     }
2239   }
2240 
2241   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2242   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2243   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2244   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2245   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2246   PetscFunctionReturn(0);
2247 }
2248 
2249 /* bs = 15 for PFLOTRAN */
2250 
2251 #undef __FUNCT__
2252 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering"
2253 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering(Mat A,Vec bb,Vec xx)
2254 {
2255   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2256   PetscErrorCode    ierr;
2257   const PetscInt    *ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2258   PetscInt          i,n=a->mbs,nz,idx,idt,idc,m;
2259   const MatScalar   *aa=a->a,*v;
2260   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2261   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2262   PetscScalar       *x,*t;
2263   const PetscScalar *b;
2264 
2265   PetscFunctionBegin;
2266   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2267   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2268   t  = a->solve_work;
2269 
2270   /* forward solve the lower triangular */
2271   idx    = 0;
2272   t[0]  = b[idx];    t[1]  = b[1+idx];  t[2]  = b[2+idx];  t[3]  = b[3+idx];  t[4]  = b[4+idx];
2273   t[5]  = b[5+idx];  t[6]  = b[6+idx];  t[7]  = b[7+idx];  t[8]  = b[8+idx];  t[9]  = b[9+idx];
2274   t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx];
2275 
2276   for (i=1; i<n; i++) {
2277     v     = aa + bs2*ai[i];
2278     vi    = aj + ai[i];
2279     nz    = ai[i+1] - ai[i];
2280     idx   = bs*i;
2281     s1   = b[idx];    s2  = b[1+idx];  s3  = b[2+idx];  s4  = b[3+idx];  s5  = b[4+idx];
2282     s6   = b[5+idx];  s7  = b[6+idx];  s8  = b[7+idx];  s9  = b[8+idx];  s10 = b[9+idx];
2283     s11  = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx];
2284     for(m=0;m<nz;m++){
2285       idx   = bs*vi[m];
2286       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2287       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2288       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2289 
2290       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2291       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2292       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2293       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2294       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2295       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2296       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2297       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2298       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2299       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2300       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2301       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2302       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2303       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2304       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2305 
2306       v += bs2;
2307     }
2308     idx = bs*i;
2309     t[idx]    = s1;  t[1+idx]  = s2;  t[2+idx]  = s3;  t[3+idx]  = s4;  t[4+idx]  = s5;
2310     t[5+idx]  = s6;  t[6+idx]  = s7;  t[7+idx]  = s8;  t[8+idx]  = s9;  t[9+idx]  = s10;
2311     t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15;
2312 
2313   }
2314   /* backward solve the upper triangular */
2315   for (i=n-1; i>=0; i--){
2316     v    = aa + bs2*(adiag[i+1]+1);
2317     vi   = aj + adiag[i+1]+1;
2318     nz   = adiag[i] - adiag[i+1] - 1;
2319     idt  = bs*i;
2320     s1   = t[idt];     s2  = t[1+idt];  s3  = t[2+idt];  s4  = t[3+idt];  s5  = t[4+idt];
2321     s6   = t[5+idt];   s7  = t[6+idt];  s8  = t[7+idt];  s9  = t[8+idt];  s10 = t[9+idt];
2322     s11  = t[10+idt]; s12  = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt];
2323 
2324     for(m=0;m<nz;m++){
2325       idx   = bs*vi[m];
2326       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2327       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2328       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2329 
2330       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2331       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2332       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2333       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2334       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2335       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2336       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2337       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2338       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2339       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2340       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2341       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2342       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2343       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2344       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2345 
2346       v += bs2;
2347     }
2348     idc = bs*i;
2349 
2350     x[idc]    = t[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2351     x[1+idc]  = t[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2352     x[2+idc]  = t[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2353     x[3+idc]  = t[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2354     x[4+idc]  = t[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2355     x[5+idc]  = t[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2356     x[6+idc]  = t[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2357     x[7+idc]  = t[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2358     x[8+idc]  = t[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2359     x[9+idc]  = t[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2360     x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2361     x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2362     x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2363     x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2364     x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2365 
2366   }
2367 
2368   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2369   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2370   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2371   PetscFunctionReturn(0);
2372 }
2373 
2374 #undef __FUNCT__
2375 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2376 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2377 {
2378   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2379   IS             iscol=a->col,isrow=a->row;
2380   PetscErrorCode ierr;
2381   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
2382   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
2383   MatScalar      *aa=a->a,*v;
2384   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2385   PetscScalar    *x,*b,*t;
2386 
2387   PetscFunctionBegin;
2388   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2389   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2390   t  = a->solve_work;
2391 
2392   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2393   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2394 
2395   /* forward solve the lower triangular */
2396   idx    = 7*(*r++);
2397   t[0] = b[idx];   t[1] = b[1+idx];
2398   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2399   t[5] = b[5+idx]; t[6] = b[6+idx];
2400 
2401   for (i=1; i<n; i++) {
2402     v     = aa + 49*ai[i];
2403     vi    = aj + ai[i];
2404     nz    = diag[i] - ai[i];
2405     idx   = 7*(*r++);
2406     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2407     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2408     while (nz--) {
2409       idx   = 7*(*vi++);
2410       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2411       x4    = t[3+idx];x5 = t[4+idx];
2412       x6    = t[5+idx];x7 = t[6+idx];
2413       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2414       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2415       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2416       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2417       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2418       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2419       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2420       v += 49;
2421     }
2422     idx = 7*i;
2423     t[idx]   = s1;t[1+idx] = s2;
2424     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2425     t[5+idx] = s6;t[6+idx] = s7;
2426   }
2427   /* backward solve the upper triangular */
2428   for (i=n-1; i>=0; i--){
2429     v    = aa + 49*diag[i] + 49;
2430     vi   = aj + diag[i] + 1;
2431     nz   = ai[i+1] - diag[i] - 1;
2432     idt  = 7*i;
2433     s1 = t[idt];  s2 = t[1+idt];
2434     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2435     s6 = t[5+idt];s7 = t[6+idt];
2436     while (nz--) {
2437       idx   = 7*(*vi++);
2438       x1    = t[idx];   x2 = t[1+idx];
2439       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2440       x6    = t[5+idx]; x7 = t[6+idx];
2441       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2442       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2443       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2444       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2445       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2446       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2447       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2448       v += 49;
2449     }
2450     idc = 7*(*c--);
2451     v   = aa + 49*diag[i];
2452     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2453                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2454     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2455                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2456     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2457                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2458     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2459                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2460     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2461                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2462     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2463                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2464     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2465                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2466   }
2467 
2468   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2469   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2470   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2471   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2472   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2473   PetscFunctionReturn(0);
2474 }
2475 
2476 #undef __FUNCT__
2477 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2478 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2479 {
2480   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2481   IS             iscol=a->col,isrow=a->row;
2482   PetscErrorCode ierr;
2483   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
2484   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
2485   MatScalar      *aa=a->a,*v;
2486   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2487   PetscScalar    *x,*b,*t;
2488 
2489   PetscFunctionBegin;
2490   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2491   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2492   t  = a->solve_work;
2493 
2494   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2495   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2496 
2497   /* forward solve the lower triangular */
2498   idx    = 7*r[0];
2499   t[0] = b[idx];   t[1] = b[1+idx];
2500   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2501   t[5] = b[5+idx]; t[6] = b[6+idx];
2502 
2503   for (i=1; i<n; i++) {
2504     v     = aa + 49*ai[i];
2505     vi    = aj + ai[i];
2506     nz    = ai[i+1] - ai[i];
2507     idx   = 7*r[i];
2508     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2509     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2510     for(m=0;m<nz;m++){
2511       idx   = 7*vi[m];
2512       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2513       x4    = t[3+idx];x5 = t[4+idx];
2514       x6    = t[5+idx];x7 = t[6+idx];
2515       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2516       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2517       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2518       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2519       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2520       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2521       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2522       v += 49;
2523     }
2524     idx = 7*i;
2525     t[idx]   = s1;t[1+idx] = s2;
2526     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2527     t[5+idx] = s6;t[6+idx] = s7;
2528   }
2529   /* backward solve the upper triangular */
2530   for (i=n-1; i>=0; i--){
2531     v    = aa + 49*(adiag[i+1]+1);
2532     vi   = aj + adiag[i+1]+1;
2533     nz   = adiag[i] - adiag[i+1] - 1;
2534     idt  = 7*i;
2535     s1 = t[idt];  s2 = t[1+idt];
2536     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2537     s6 = t[5+idt];s7 = t[6+idt];
2538     for(m=0;m<nz;m++){
2539       idx   = 7*vi[m];
2540       x1    = t[idx];   x2 = t[1+idx];
2541       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2542       x6    = t[5+idx]; x7 = t[6+idx];
2543       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2544       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2545       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2546       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2547       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2548       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2549       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2550       v += 49;
2551     }
2552     idc = 7*c[i];
2553     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2554                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2555     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2556                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2557     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2558                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2559     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2560                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2561     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2562                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2563     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2564                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2565     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2566                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2567   }
2568 
2569   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2570   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2571   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2572   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2573   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2574   PetscFunctionReturn(0);
2575 }
2576 
2577 #undef __FUNCT__
2578 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2579 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2580 {
2581   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2582   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2583   PetscErrorCode    ierr;
2584   PetscInt          *diag = a->diag,jdx;
2585   const MatScalar   *aa=a->a,*v;
2586   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2587   const PetscScalar *b;
2588 
2589   PetscFunctionBegin;
2590   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2591   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2592   /* forward solve the lower triangular */
2593   idx    = 0;
2594   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2595   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2596   x[6] = b[6+idx];
2597   for (i=1; i<n; i++) {
2598     v     =  aa + 49*ai[i];
2599     vi    =  aj + ai[i];
2600     nz    =  diag[i] - ai[i];
2601     idx   =  7*i;
2602     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2603     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2604     s7  =  b[6+idx];
2605     while (nz--) {
2606       jdx   = 7*(*vi++);
2607       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2608       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2609       x7    = x[6+jdx];
2610       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2611       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2612       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2613       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2614       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2615       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2616       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2617       v += 49;
2618      }
2619     x[idx]   = s1;
2620     x[1+idx] = s2;
2621     x[2+idx] = s3;
2622     x[3+idx] = s4;
2623     x[4+idx] = s5;
2624     x[5+idx] = s6;
2625     x[6+idx] = s7;
2626   }
2627   /* backward solve the upper triangular */
2628   for (i=n-1; i>=0; i--){
2629     v    = aa + 49*diag[i] + 49;
2630     vi   = aj + diag[i] + 1;
2631     nz   = ai[i+1] - diag[i] - 1;
2632     idt  = 7*i;
2633     s1 = x[idt];   s2 = x[1+idt];
2634     s3 = x[2+idt]; s4 = x[3+idt];
2635     s5 = x[4+idt]; s6 = x[5+idt];
2636     s7 = x[6+idt];
2637     while (nz--) {
2638       idx   = 7*(*vi++);
2639       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2640       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2641       x7    = x[6+idx];
2642       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2643       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2644       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2645       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2646       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2647       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2648       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2649       v += 49;
2650     }
2651     v        = aa + 49*diag[i];
2652     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2653                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2654     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2655                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2656     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2657                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2658     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2659                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2660     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2661                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2662     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2663                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2664     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2665                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2666   }
2667 
2668   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2669   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2670   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2671   PetscFunctionReturn(0);
2672 }
2673 
2674 #undef __FUNCT__
2675 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2676 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2677 {
2678     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2679     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2680     PetscErrorCode    ierr;
2681     PetscInt          idx,jdx,idt;
2682     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2683     const MatScalar   *aa=a->a,*v;
2684     PetscScalar       *x;
2685     const PetscScalar *b;
2686     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2687 
2688     PetscFunctionBegin;
2689     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2690     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2691     /* forward solve the lower triangular */
2692     idx    = 0;
2693     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2694     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2695     for (i=1; i<n; i++) {
2696        v    = aa + bs2*ai[i];
2697        vi   = aj + ai[i];
2698        nz   = ai[i+1] - ai[i];
2699       idx   = bs*i;
2700        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2701        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2702        for(k=0;k<nz;k++) {
2703           jdx   = bs*vi[k];
2704           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2705 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2706           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2707           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2708           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2709 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2710           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2711 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2712 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2713           v   +=  bs2;
2714         }
2715 
2716        x[idx]   = s1;
2717        x[1+idx] = s2;
2718        x[2+idx] = s3;
2719        x[3+idx] = s4;
2720        x[4+idx] = s5;
2721        x[5+idx] = s6;
2722        x[6+idx] = s7;
2723     }
2724 
2725    /* backward solve the upper triangular */
2726   for (i=n-1; i>=0; i--){
2727     v   = aa + bs2*(adiag[i+1]+1);
2728      vi  = aj + adiag[i+1]+1;
2729      nz  = adiag[i] - adiag[i+1]-1;
2730      idt = bs*i;
2731      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2732      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2733     for(k=0;k<nz;k++) {
2734       idx   = bs*vi[k];
2735        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2736        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2737        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2738        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2739        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2740        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2741        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2742        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2743        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2744         v   +=  bs2;
2745     }
2746     /* x = inv_diagonal*x */
2747     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2748     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2749     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2750     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2751     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2752     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2753     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2754   }
2755 
2756   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2757   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2758   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2759   PetscFunctionReturn(0);
2760 }
2761 
2762 #undef __FUNCT__
2763 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2764 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2765 {
2766   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2767   IS                iscol=a->col,isrow=a->row;
2768   PetscErrorCode    ierr;
2769   const PetscInt    *r,*c,*rout,*cout;
2770   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2771   const MatScalar   *aa=a->a,*v;
2772   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2773   const PetscScalar *b;
2774   PetscFunctionBegin;
2775   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2776   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2777   t  = a->solve_work;
2778 
2779   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2780   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2781 
2782   /* forward solve the lower triangular */
2783   idx    = 6*(*r++);
2784   t[0] = b[idx];   t[1] = b[1+idx];
2785   t[2] = b[2+idx]; t[3] = b[3+idx];
2786   t[4] = b[4+idx]; t[5] = b[5+idx];
2787   for (i=1; i<n; i++) {
2788     v     = aa + 36*ai[i];
2789     vi    = aj + ai[i];
2790     nz    = diag[i] - ai[i];
2791     idx   = 6*(*r++);
2792     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2793     s5  = b[4+idx]; s6 = b[5+idx];
2794     while (nz--) {
2795       idx   = 6*(*vi++);
2796       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2797       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2798       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2799       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2800       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2801       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2802       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2803       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2804       v += 36;
2805     }
2806     idx = 6*i;
2807     t[idx]   = s1;t[1+idx] = s2;
2808     t[2+idx] = s3;t[3+idx] = s4;
2809     t[4+idx] = s5;t[5+idx] = s6;
2810   }
2811   /* backward solve the upper triangular */
2812   for (i=n-1; i>=0; i--){
2813     v    = aa + 36*diag[i] + 36;
2814     vi   = aj + diag[i] + 1;
2815     nz   = ai[i+1] - diag[i] - 1;
2816     idt  = 6*i;
2817     s1 = t[idt];  s2 = t[1+idt];
2818     s3 = t[2+idt];s4 = t[3+idt];
2819     s5 = t[4+idt];s6 = t[5+idt];
2820     while (nz--) {
2821       idx   = 6*(*vi++);
2822       x1    = t[idx];   x2 = t[1+idx];
2823       x3    = t[2+idx]; x4 = t[3+idx];
2824       x5    = t[4+idx]; x6 = t[5+idx];
2825       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2826       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2827       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2828       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2829       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2830       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2831       v += 36;
2832     }
2833     idc = 6*(*c--);
2834     v   = aa + 36*diag[i];
2835     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2836                                  v[18]*s4+v[24]*s5+v[30]*s6;
2837     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2838                                  v[19]*s4+v[25]*s5+v[31]*s6;
2839     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2840                                  v[20]*s4+v[26]*s5+v[32]*s6;
2841     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2842                                  v[21]*s4+v[27]*s5+v[33]*s6;
2843     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2844                                  v[22]*s4+v[28]*s5+v[34]*s6;
2845     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2846                                  v[23]*s4+v[29]*s5+v[35]*s6;
2847   }
2848 
2849   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2850   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2851   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2852   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2853   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2854   PetscFunctionReturn(0);
2855 }
2856 
2857 #undef __FUNCT__
2858 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
2859 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2860 {
2861   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2862   IS                iscol=a->col,isrow=a->row;
2863   PetscErrorCode    ierr;
2864   const PetscInt    *r,*c,*rout,*cout;
2865   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2866   const MatScalar   *aa=a->a,*v;
2867   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2868   const PetscScalar *b;
2869   PetscFunctionBegin;
2870   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2871   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2872   t  = a->solve_work;
2873 
2874   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2875   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2876 
2877   /* forward solve the lower triangular */
2878   idx    = 6*r[0];
2879   t[0] = b[idx];   t[1] = b[1+idx];
2880   t[2] = b[2+idx]; t[3] = b[3+idx];
2881   t[4] = b[4+idx]; t[5] = b[5+idx];
2882   for (i=1; i<n; i++) {
2883     v     = aa + 36*ai[i];
2884     vi    = aj + ai[i];
2885     nz    = ai[i+1] - ai[i];
2886     idx   = 6*r[i];
2887     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2888     s5  = b[4+idx]; s6 = b[5+idx];
2889     for(m=0;m<nz;m++){
2890       idx   = 6*vi[m];
2891       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2892       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2893       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2894       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2895       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2896       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2897       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2898       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2899       v += 36;
2900     }
2901     idx = 6*i;
2902     t[idx]   = s1;t[1+idx] = s2;
2903     t[2+idx] = s3;t[3+idx] = s4;
2904     t[4+idx] = s5;t[5+idx] = s6;
2905   }
2906   /* backward solve the upper triangular */
2907   for (i=n-1; i>=0; i--){
2908     v    = aa + 36*(adiag[i+1]+1);
2909     vi   = aj + adiag[i+1]+1;
2910     nz   = adiag[i] - adiag[i+1] - 1;
2911     idt  = 6*i;
2912     s1 = t[idt];  s2 = t[1+idt];
2913     s3 = t[2+idt];s4 = t[3+idt];
2914     s5 = t[4+idt];s6 = t[5+idt];
2915     for(m=0;m<nz;m++){
2916       idx   = 6*vi[m];
2917       x1    = t[idx];   x2 = t[1+idx];
2918       x3    = t[2+idx]; x4 = t[3+idx];
2919       x5    = t[4+idx]; x6 = t[5+idx];
2920       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2921       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2922       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2923       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2924       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2925       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2926       v += 36;
2927     }
2928     idc = 6*c[i];
2929     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2930                                  v[18]*s4+v[24]*s5+v[30]*s6;
2931     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2932                                  v[19]*s4+v[25]*s5+v[31]*s6;
2933     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2934                                  v[20]*s4+v[26]*s5+v[32]*s6;
2935     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2936                                  v[21]*s4+v[27]*s5+v[33]*s6;
2937     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2938                                  v[22]*s4+v[28]*s5+v[34]*s6;
2939     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2940                                  v[23]*s4+v[29]*s5+v[35]*s6;
2941   }
2942 
2943   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2944   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2945   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2946   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2947   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2948   PetscFunctionReturn(0);
2949 }
2950 
2951 #undef __FUNCT__
2952 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
2953 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2954 {
2955   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2956   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2957   PetscErrorCode    ierr;
2958   PetscInt          *diag = a->diag,jdx;
2959   const MatScalar   *aa=a->a,*v;
2960   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2961   const PetscScalar *b;
2962 
2963   PetscFunctionBegin;
2964   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2965   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2966   /* forward solve the lower triangular */
2967   idx    = 0;
2968   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2969   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2970   for (i=1; i<n; i++) {
2971     v     =  aa + 36*ai[i];
2972     vi    =  aj + ai[i];
2973     nz    =  diag[i] - ai[i];
2974     idx   =  6*i;
2975     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2976     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2977     while (nz--) {
2978       jdx   = 6*(*vi++);
2979       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2980       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2981       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2982       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2983       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2984       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2985       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2986       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2987       v += 36;
2988      }
2989     x[idx]   = s1;
2990     x[1+idx] = s2;
2991     x[2+idx] = s3;
2992     x[3+idx] = s4;
2993     x[4+idx] = s5;
2994     x[5+idx] = s6;
2995   }
2996   /* backward solve the upper triangular */
2997   for (i=n-1; i>=0; i--){
2998     v    = aa + 36*diag[i] + 36;
2999     vi   = aj + diag[i] + 1;
3000     nz   = ai[i+1] - diag[i] - 1;
3001     idt  = 6*i;
3002     s1 = x[idt];   s2 = x[1+idt];
3003     s3 = x[2+idt]; s4 = x[3+idt];
3004     s5 = x[4+idt]; s6 = x[5+idt];
3005     while (nz--) {
3006       idx   = 6*(*vi++);
3007       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3008       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3009       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3010       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3011       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3012       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3013       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3014       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3015       v += 36;
3016     }
3017     v        = aa + 36*diag[i];
3018     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3019     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3020     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3021     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3022     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3023     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3024   }
3025 
3026   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3027   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3028   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3029   PetscFunctionReturn(0);
3030 }
3031 
3032 #undef __FUNCT__
3033 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3034 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3035 {
3036     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3037     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3038     PetscErrorCode    ierr;
3039     PetscInt          idx,jdx,idt;
3040     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3041     const MatScalar   *aa=a->a,*v;
3042     PetscScalar       *x;
3043     const PetscScalar *b;
3044     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3045 
3046     PetscFunctionBegin;
3047     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3048     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3049     /* forward solve the lower triangular */
3050     idx    = 0;
3051     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3052     x[4] = b[4+idx];x[5] = b[5+idx];
3053     for (i=1; i<n; i++) {
3054        v    = aa + bs2*ai[i];
3055        vi   = aj + ai[i];
3056        nz   = ai[i+1] - ai[i];
3057       idx   = bs*i;
3058        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3059        s5   = b[4+idx];s6 = b[5+idx];
3060        for(k=0;k<nz;k++){
3061           jdx   = bs*vi[k];
3062           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3063 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3064           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3065           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3066           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3067 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3068           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3069 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3070           v   +=  bs2;
3071         }
3072 
3073        x[idx]   = s1;
3074        x[1+idx] = s2;
3075        x[2+idx] = s3;
3076        x[3+idx] = s4;
3077        x[4+idx] = s5;
3078        x[5+idx] = s6;
3079     }
3080 
3081    /* backward solve the upper triangular */
3082   for (i=n-1; i>=0; i--){
3083     v   = aa + bs2*(adiag[i+1]+1);
3084      vi  = aj + adiag[i+1]+1;
3085      nz  = adiag[i] - adiag[i+1]-1;
3086      idt = bs*i;
3087      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3088      s5 = x[4+idt];s6 = x[5+idt];
3089      for(k=0;k<nz;k++){
3090       idx   = bs*vi[k];
3091        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3092        x5    = x[4+idx];x6 = x[5+idx];
3093        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3094        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3095        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3096        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3097        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3098        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3099         v   +=  bs2;
3100     }
3101     /* x = inv_diagonal*x */
3102    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3103    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3104    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3105    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3106    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3107    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3108   }
3109 
3110   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3111   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3112   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3113   PetscFunctionReturn(0);
3114 }
3115 
3116 #undef __FUNCT__
3117 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3118 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3119 {
3120   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3121   IS                iscol=a->col,isrow=a->row;
3122   PetscErrorCode    ierr;
3123   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3124   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3125   const MatScalar   *aa=a->a,*v;
3126   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3127   const PetscScalar *b;
3128 
3129   PetscFunctionBegin;
3130   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3131   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3132   t  = a->solve_work;
3133 
3134   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3135   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3136 
3137   /* forward solve the lower triangular */
3138   idx    = 5*(*r++);
3139   t[0] = b[idx];   t[1] = b[1+idx];
3140   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3141   for (i=1; i<n; i++) {
3142     v     = aa + 25*ai[i];
3143     vi    = aj + ai[i];
3144     nz    = diag[i] - ai[i];
3145     idx   = 5*(*r++);
3146     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3147     s5  = b[4+idx];
3148     while (nz--) {
3149       idx   = 5*(*vi++);
3150       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3151       x4    = t[3+idx];x5 = t[4+idx];
3152       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3153       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3154       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3155       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3156       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3157       v += 25;
3158     }
3159     idx = 5*i;
3160     t[idx]   = s1;t[1+idx] = s2;
3161     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3162   }
3163   /* backward solve the upper triangular */
3164   for (i=n-1; i>=0; i--){
3165     v    = aa + 25*diag[i] + 25;
3166     vi   = aj + diag[i] + 1;
3167     nz   = ai[i+1] - diag[i] - 1;
3168     idt  = 5*i;
3169     s1 = t[idt];  s2 = t[1+idt];
3170     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3171     while (nz--) {
3172       idx   = 5*(*vi++);
3173       x1    = t[idx];   x2 = t[1+idx];
3174       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3175       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3176       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3177       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3178       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3179       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3180       v += 25;
3181     }
3182     idc = 5*(*c--);
3183     v   = aa + 25*diag[i];
3184     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3185                                  v[15]*s4+v[20]*s5;
3186     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3187                                  v[16]*s4+v[21]*s5;
3188     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3189                                  v[17]*s4+v[22]*s5;
3190     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3191                                  v[18]*s4+v[23]*s5;
3192     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3193                                  v[19]*s4+v[24]*s5;
3194   }
3195 
3196   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3197   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3198   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3199   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3200   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3201   PetscFunctionReturn(0);
3202 }
3203 
3204 #undef __FUNCT__
3205 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3206 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3207 {
3208   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3209   IS                iscol=a->col,isrow=a->row;
3210   PetscErrorCode    ierr;
3211   const PetscInt    *r,*c,*rout,*cout;
3212   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3213   const MatScalar   *aa=a->a,*v;
3214   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3215   const PetscScalar *b;
3216 
3217   PetscFunctionBegin;
3218   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3219   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3220   t  = a->solve_work;
3221 
3222   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3223   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3224 
3225   /* forward solve the lower triangular */
3226   idx    = 5*r[0];
3227   t[0] = b[idx];   t[1] = b[1+idx];
3228   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3229   for (i=1; i<n; i++) {
3230     v     = aa + 25*ai[i];
3231     vi    = aj + ai[i];
3232     nz    = ai[i+1] - ai[i];
3233     idx   = 5*r[i];
3234     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3235     s5  = b[4+idx];
3236     for(m=0;m<nz;m++){
3237       idx   = 5*vi[m];
3238       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3239       x4    = t[3+idx];x5 = t[4+idx];
3240       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3241       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3242       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3243       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3244       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3245       v += 25;
3246     }
3247     idx = 5*i;
3248     t[idx]   = s1;t[1+idx] = s2;
3249     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3250   }
3251   /* backward solve the upper triangular */
3252   for (i=n-1; i>=0; i--){
3253     v    = aa + 25*(adiag[i+1]+1);
3254     vi   = aj + adiag[i+1]+1;
3255     nz   = adiag[i] - adiag[i+1] - 1;
3256     idt  = 5*i;
3257     s1 = t[idt];  s2 = t[1+idt];
3258     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3259     for(m=0;m<nz;m++){
3260       idx   = 5*vi[m];
3261       x1    = t[idx];   x2 = t[1+idx];
3262       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3263       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3264       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3265       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3266       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3267       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3268       v += 25;
3269     }
3270     idc = 5*c[i];
3271     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3272                                  v[15]*s4+v[20]*s5;
3273     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3274                                  v[16]*s4+v[21]*s5;
3275     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3276                                  v[17]*s4+v[22]*s5;
3277     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3278                                  v[18]*s4+v[23]*s5;
3279     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3280                                  v[19]*s4+v[24]*s5;
3281   }
3282 
3283   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3284   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3285   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3286   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3287   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3288   PetscFunctionReturn(0);
3289 }
3290 
3291 #undef __FUNCT__
3292 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3293 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3294 {
3295   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3296   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3297   PetscErrorCode    ierr;
3298   PetscInt          *diag = a->diag,jdx;
3299   const MatScalar   *aa=a->a,*v;
3300   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3301   const PetscScalar *b;
3302 
3303   PetscFunctionBegin;
3304   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3305   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3306   /* forward solve the lower triangular */
3307   idx    = 0;
3308   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3309   for (i=1; i<n; i++) {
3310     v     =  aa + 25*ai[i];
3311     vi    =  aj + ai[i];
3312     nz    =  diag[i] - ai[i];
3313     idx   =  5*i;
3314     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3315     while (nz--) {
3316       jdx   = 5*(*vi++);
3317       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3318       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3319       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3320       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3321       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3322       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3323       v    += 25;
3324     }
3325     x[idx]   = s1;
3326     x[1+idx] = s2;
3327     x[2+idx] = s3;
3328     x[3+idx] = s4;
3329     x[4+idx] = s5;
3330   }
3331   /* backward solve the upper triangular */
3332   for (i=n-1; i>=0; i--){
3333     v    = aa + 25*diag[i] + 25;
3334     vi   = aj + diag[i] + 1;
3335     nz   = ai[i+1] - diag[i] - 1;
3336     idt  = 5*i;
3337     s1 = x[idt];  s2 = x[1+idt];
3338     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3339     while (nz--) {
3340       idx   = 5*(*vi++);
3341       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3342       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3343       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3344       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3345       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3346       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3347       v    += 25;
3348     }
3349     v        = aa + 25*diag[i];
3350     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3351     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3352     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3353     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3354     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3355   }
3356 
3357   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3358   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3359   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3360   PetscFunctionReturn(0);
3361 }
3362 
3363 #undef __FUNCT__
3364 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3365 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3366 {
3367   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3368   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
3369   PetscErrorCode    ierr;
3370   PetscInt          jdx;
3371   const MatScalar   *aa=a->a,*v;
3372   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3373   const PetscScalar *b;
3374 
3375   PetscFunctionBegin;
3376   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3377   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3378   /* forward solve the lower triangular */
3379   idx    = 0;
3380   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3381   for (i=1; i<n; i++) {
3382     v   = aa + 25*ai[i];
3383     vi  = aj + ai[i];
3384     nz  = ai[i+1] - ai[i];
3385     idx = 5*i;
3386     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3387     for(k=0;k<nz;k++) {
3388       jdx   = 5*vi[k];
3389       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3390       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3391       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3392       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3393       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3394       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3395       v    += 25;
3396     }
3397     x[idx]   = s1;
3398     x[1+idx] = s2;
3399     x[2+idx] = s3;
3400     x[3+idx] = s4;
3401     x[4+idx] = s5;
3402   }
3403 
3404   /* backward solve the upper triangular */
3405   for (i=n-1; i>=0; i--){
3406     v   = aa + 25*(adiag[i+1]+1);
3407     vi  = aj + adiag[i+1]+1;
3408     nz  = adiag[i] - adiag[i+1]-1;
3409     idt = 5*i;
3410     s1 = x[idt];  s2 = x[1+idt];
3411     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3412     for(k=0;k<nz;k++){
3413       idx   = 5*vi[k];
3414       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3415       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3416       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3417       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3418       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3419       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3420       v    += 25;
3421     }
3422     /* x = inv_diagonal*x */
3423     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3424     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3425     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3426     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3427     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3428   }
3429 
3430   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3431   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3432   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3433   PetscFunctionReturn(0);
3434 }
3435 
3436 #undef __FUNCT__
3437 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3438 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3439 {
3440   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3441   IS                iscol=a->col,isrow=a->row;
3442   PetscErrorCode    ierr;
3443   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3444   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3445   const MatScalar   *aa=a->a,*v;
3446   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3447   const PetscScalar *b;
3448 
3449   PetscFunctionBegin;
3450   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3451   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3452   t  = a->solve_work;
3453 
3454   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3455   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3456 
3457   /* forward solve the lower triangular */
3458   idx    = 4*(*r++);
3459   t[0] = b[idx];   t[1] = b[1+idx];
3460   t[2] = b[2+idx]; t[3] = b[3+idx];
3461   for (i=1; i<n; i++) {
3462     v     = aa + 16*ai[i];
3463     vi    = aj + ai[i];
3464     nz    = diag[i] - ai[i];
3465     idx   = 4*(*r++);
3466     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3467     while (nz--) {
3468       idx   = 4*(*vi++);
3469       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3470       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3471       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3472       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3473       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3474       v    += 16;
3475     }
3476     idx        = 4*i;
3477     t[idx]   = s1;t[1+idx] = s2;
3478     t[2+idx] = s3;t[3+idx] = s4;
3479   }
3480   /* backward solve the upper triangular */
3481   for (i=n-1; i>=0; i--){
3482     v    = aa + 16*diag[i] + 16;
3483     vi   = aj + diag[i] + 1;
3484     nz   = ai[i+1] - diag[i] - 1;
3485     idt  = 4*i;
3486     s1 = t[idt];  s2 = t[1+idt];
3487     s3 = t[2+idt];s4 = t[3+idt];
3488     while (nz--) {
3489       idx   = 4*(*vi++);
3490       x1    = t[idx];   x2 = t[1+idx];
3491       x3    = t[2+idx]; x4 = t[3+idx];
3492       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3493       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3494       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3495       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3496       v += 16;
3497     }
3498     idc      = 4*(*c--);
3499     v        = aa + 16*diag[i];
3500     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3501     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3502     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3503     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3504   }
3505 
3506   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3507   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3508   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3509   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3510   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3511   PetscFunctionReturn(0);
3512 }
3513 
3514 #undef __FUNCT__
3515 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3516 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3517 {
3518   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3519   IS                iscol=a->col,isrow=a->row;
3520   PetscErrorCode    ierr;
3521   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3522   const PetscInt    *r,*c,*rout,*cout;
3523   const MatScalar   *aa=a->a,*v;
3524   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3525   const PetscScalar *b;
3526 
3527   PetscFunctionBegin;
3528   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3529   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3530   t  = a->solve_work;
3531 
3532   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3533   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3534 
3535   /* forward solve the lower triangular */
3536   idx    = 4*r[0];
3537   t[0] = b[idx];   t[1] = b[1+idx];
3538   t[2] = b[2+idx]; t[3] = b[3+idx];
3539   for (i=1; i<n; i++) {
3540     v     = aa + 16*ai[i];
3541     vi    = aj + ai[i];
3542     nz    = ai[i+1] - ai[i];
3543     idx   = 4*r[i];
3544     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3545     for(m=0;m<nz;m++){
3546       idx   = 4*vi[m];
3547       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3548       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3549       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3550       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3551       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3552       v    += 16;
3553     }
3554     idx        = 4*i;
3555     t[idx]   = s1;t[1+idx] = s2;
3556     t[2+idx] = s3;t[3+idx] = s4;
3557   }
3558   /* backward solve the upper triangular */
3559   for (i=n-1; i>=0; i--){
3560     v    = aa + 16*(adiag[i+1]+1);
3561     vi   = aj + adiag[i+1]+1;
3562     nz   = adiag[i] - adiag[i+1] - 1;
3563     idt  = 4*i;
3564     s1 = t[idt];  s2 = t[1+idt];
3565     s3 = t[2+idt];s4 = t[3+idt];
3566     for(m=0;m<nz;m++){
3567       idx   = 4*vi[m];
3568       x1    = t[idx];   x2 = t[1+idx];
3569       x3    = t[2+idx]; x4 = t[3+idx];
3570       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3571       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3572       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3573       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3574       v += 16;
3575     }
3576     idc      = 4*c[i];
3577     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3578     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3579     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3580     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3581   }
3582 
3583   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3584   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3585   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3586   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3587   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3588   PetscFunctionReturn(0);
3589 }
3590 
3591 #undef __FUNCT__
3592 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3593 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3594 {
3595   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3596   IS                iscol=a->col,isrow=a->row;
3597   PetscErrorCode    ierr;
3598   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3599   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3600   const MatScalar   *aa=a->a,*v;
3601   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3602   PetscScalar       *x;
3603   const PetscScalar *b;
3604 
3605   PetscFunctionBegin;
3606   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3607   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3608   t  = (MatScalar *)a->solve_work;
3609 
3610   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3611   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3612 
3613   /* forward solve the lower triangular */
3614   idx    = 4*(*r++);
3615   t[0] = (MatScalar)b[idx];
3616   t[1] = (MatScalar)b[1+idx];
3617   t[2] = (MatScalar)b[2+idx];
3618   t[3] = (MatScalar)b[3+idx];
3619   for (i=1; i<n; i++) {
3620     v     = aa + 16*ai[i];
3621     vi    = aj + ai[i];
3622     nz    = diag[i] - ai[i];
3623     idx   = 4*(*r++);
3624     s1 = (MatScalar)b[idx];
3625     s2 = (MatScalar)b[1+idx];
3626     s3 = (MatScalar)b[2+idx];
3627     s4 = (MatScalar)b[3+idx];
3628     while (nz--) {
3629       idx   = 4*(*vi++);
3630       x1  = t[idx];
3631       x2  = t[1+idx];
3632       x3  = t[2+idx];
3633       x4  = t[3+idx];
3634       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3635       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3636       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3637       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3638       v    += 16;
3639     }
3640     idx        = 4*i;
3641     t[idx]   = s1;
3642     t[1+idx] = s2;
3643     t[2+idx] = s3;
3644     t[3+idx] = s4;
3645   }
3646   /* backward solve the upper triangular */
3647   for (i=n-1; i>=0; i--){
3648     v    = aa + 16*diag[i] + 16;
3649     vi   = aj + diag[i] + 1;
3650     nz   = ai[i+1] - diag[i] - 1;
3651     idt  = 4*i;
3652     s1 = t[idt];
3653     s2 = t[1+idt];
3654     s3 = t[2+idt];
3655     s4 = t[3+idt];
3656     while (nz--) {
3657       idx   = 4*(*vi++);
3658       x1  = t[idx];
3659       x2  = t[1+idx];
3660       x3  = t[2+idx];
3661       x4  = t[3+idx];
3662       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3663       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3664       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3665       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3666       v += 16;
3667     }
3668     idc      = 4*(*c--);
3669     v        = aa + 16*diag[i];
3670     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3671     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3672     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3673     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3674     x[idc]   = (PetscScalar)t[idt];
3675     x[1+idc] = (PetscScalar)t[1+idt];
3676     x[2+idc] = (PetscScalar)t[2+idt];
3677     x[3+idc] = (PetscScalar)t[3+idt];
3678  }
3679 
3680   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3681   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3682   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3683   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3684   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3685   PetscFunctionReturn(0);
3686 }
3687 
3688 #if defined (PETSC_HAVE_SSE)
3689 
3690 #include PETSC_HAVE_SSE
3691 
3692 #undef __FUNCT__
3693 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3694 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3695 {
3696   /*
3697      Note: This code uses demotion of double
3698      to float when performing the mixed-mode computation.
3699      This may not be numerically reasonable for all applications.
3700   */
3701   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3702   IS             iscol=a->col,isrow=a->row;
3703   PetscErrorCode ierr;
3704   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3705   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3706   MatScalar      *aa=a->a,*v;
3707   PetscScalar    *x,*b,*t;
3708 
3709   /* Make space in temp stack for 16 Byte Aligned arrays */
3710   float           ssealignedspace[11],*tmps,*tmpx;
3711   unsigned long   offset;
3712 
3713   PetscFunctionBegin;
3714   SSE_SCOPE_BEGIN;
3715 
3716     offset = (unsigned long)ssealignedspace % 16;
3717     if (offset) offset = (16 - offset)/4;
3718     tmps = &ssealignedspace[offset];
3719     tmpx = &ssealignedspace[offset+4];
3720     PREFETCH_NTA(aa+16*ai[1]);
3721 
3722     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3723     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3724     t  = a->solve_work;
3725 
3726     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3727     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3728 
3729     /* forward solve the lower triangular */
3730     idx  = 4*(*r++);
3731     t[0] = b[idx];   t[1] = b[1+idx];
3732     t[2] = b[2+idx]; t[3] = b[3+idx];
3733     v    =  aa + 16*ai[1];
3734 
3735     for (i=1; i<n;) {
3736       PREFETCH_NTA(&v[8]);
3737       vi   =  aj      + ai[i];
3738       nz   =  diag[i] - ai[i];
3739       idx  =  4*(*r++);
3740 
3741       /* Demote sum from double to float */
3742       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3743       LOAD_PS(tmps,XMM7);
3744 
3745       while (nz--) {
3746         PREFETCH_NTA(&v[16]);
3747         idx = 4*(*vi++);
3748 
3749         /* Demote solution (so far) from double to float */
3750         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3751 
3752         /* 4x4 Matrix-Vector product with negative accumulation: */
3753         SSE_INLINE_BEGIN_2(tmpx,v)
3754           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3755 
3756           /* First Column */
3757           SSE_COPY_PS(XMM0,XMM6)
3758           SSE_SHUFFLE(XMM0,XMM0,0x00)
3759           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3760           SSE_SUB_PS(XMM7,XMM0)
3761 
3762           /* Second Column */
3763           SSE_COPY_PS(XMM1,XMM6)
3764           SSE_SHUFFLE(XMM1,XMM1,0x55)
3765           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3766           SSE_SUB_PS(XMM7,XMM1)
3767 
3768           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3769 
3770           /* Third Column */
3771           SSE_COPY_PS(XMM2,XMM6)
3772           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3773           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3774           SSE_SUB_PS(XMM7,XMM2)
3775 
3776           /* Fourth Column */
3777           SSE_COPY_PS(XMM3,XMM6)
3778           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3779           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3780           SSE_SUB_PS(XMM7,XMM3)
3781         SSE_INLINE_END_2
3782 
3783         v  += 16;
3784       }
3785       idx = 4*i;
3786       v   = aa + 16*ai[++i];
3787       PREFETCH_NTA(v);
3788       STORE_PS(tmps,XMM7);
3789 
3790       /* Promote result from float to double */
3791       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3792     }
3793     /* backward solve the upper triangular */
3794     idt  = 4*(n-1);
3795     ai16 = 16*diag[n-1];
3796     v    = aa + ai16 + 16;
3797     for (i=n-1; i>=0;){
3798       PREFETCH_NTA(&v[8]);
3799       vi = aj + diag[i] + 1;
3800       nz = ai[i+1] - diag[i] - 1;
3801 
3802       /* Demote accumulator from double to float */
3803       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3804       LOAD_PS(tmps,XMM7);
3805 
3806       while (nz--) {
3807         PREFETCH_NTA(&v[16]);
3808         idx = 4*(*vi++);
3809 
3810         /* Demote solution (so far) from double to float */
3811         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3812 
3813         /* 4x4 Matrix-Vector Product with negative accumulation: */
3814         SSE_INLINE_BEGIN_2(tmpx,v)
3815           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3816 
3817           /* First Column */
3818           SSE_COPY_PS(XMM0,XMM6)
3819           SSE_SHUFFLE(XMM0,XMM0,0x00)
3820           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3821           SSE_SUB_PS(XMM7,XMM0)
3822 
3823           /* Second Column */
3824           SSE_COPY_PS(XMM1,XMM6)
3825           SSE_SHUFFLE(XMM1,XMM1,0x55)
3826           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3827           SSE_SUB_PS(XMM7,XMM1)
3828 
3829           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3830 
3831           /* Third Column */
3832           SSE_COPY_PS(XMM2,XMM6)
3833           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3834           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3835           SSE_SUB_PS(XMM7,XMM2)
3836 
3837           /* Fourth Column */
3838           SSE_COPY_PS(XMM3,XMM6)
3839           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3840           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3841           SSE_SUB_PS(XMM7,XMM3)
3842         SSE_INLINE_END_2
3843         v  += 16;
3844       }
3845       v    = aa + ai16;
3846       ai16 = 16*diag[--i];
3847       PREFETCH_NTA(aa+ai16+16);
3848       /*
3849          Scale the result by the diagonal 4x4 block,
3850          which was inverted as part of the factorization
3851       */
3852       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3853         /* First Column */
3854         SSE_COPY_PS(XMM0,XMM7)
3855         SSE_SHUFFLE(XMM0,XMM0,0x00)
3856         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3857 
3858         /* Second Column */
3859         SSE_COPY_PS(XMM1,XMM7)
3860         SSE_SHUFFLE(XMM1,XMM1,0x55)
3861         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3862         SSE_ADD_PS(XMM0,XMM1)
3863 
3864         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3865 
3866         /* Third Column */
3867         SSE_COPY_PS(XMM2,XMM7)
3868         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3869         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3870         SSE_ADD_PS(XMM0,XMM2)
3871 
3872         /* Fourth Column */
3873         SSE_COPY_PS(XMM3,XMM7)
3874         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3875         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3876         SSE_ADD_PS(XMM0,XMM3)
3877 
3878         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3879       SSE_INLINE_END_3
3880 
3881       /* Promote solution from float to double */
3882       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3883 
3884       /* Apply reordering to t and stream into x.    */
3885       /* This way, x doesn't pollute the cache.      */
3886       /* Be careful with size: 2 doubles = 4 floats! */
3887       idc  = 4*(*c--);
3888       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3889         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3890         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3891         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3892         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3893         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3894         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3895       SSE_INLINE_END_2
3896       v    = aa + ai16 + 16;
3897       idt -= 4;
3898     }
3899 
3900     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3901     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3902     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3903     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3904     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3905   SSE_SCOPE_END;
3906   PetscFunctionReturn(0);
3907 }
3908 
3909 #endif
3910 
3911 
3912 /*
3913       Special case where the matrix was ILU(0) factored in the natural
3914    ordering. This eliminates the need for the column and row permutation.
3915 */
3916 #undef __FUNCT__
3917 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
3918 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3919 {
3920   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3921   PetscInt          n=a->mbs;
3922   const PetscInt    *ai=a->i,*aj=a->j;
3923   PetscErrorCode    ierr;
3924   const PetscInt    *diag = a->diag;
3925   const MatScalar   *aa=a->a;
3926   PetscScalar       *x;
3927   const PetscScalar *b;
3928 
3929   PetscFunctionBegin;
3930   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3931   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3932 
3933 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3934   {
3935     static PetscScalar w[2000]; /* very BAD need to fix */
3936     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3937   }
3938 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3939   {
3940     static PetscScalar w[2000]; /* very BAD need to fix */
3941     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3942   }
3943 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3944   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3945 #else
3946   {
3947     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3948     const MatScalar *v;
3949     PetscInt        jdx,idt,idx,nz,i,ai16;
3950     const PetscInt  *vi;
3951 
3952   /* forward solve the lower triangular */
3953   idx    = 0;
3954   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3955   for (i=1; i<n; i++) {
3956     v     =  aa      + 16*ai[i];
3957     vi    =  aj      + ai[i];
3958     nz    =  diag[i] - ai[i];
3959     idx   +=  4;
3960     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3961     while (nz--) {
3962       jdx   = 4*(*vi++);
3963       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3964       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3965       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3966       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3967       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3968       v    += 16;
3969     }
3970     x[idx]   = s1;
3971     x[1+idx] = s2;
3972     x[2+idx] = s3;
3973     x[3+idx] = s4;
3974   }
3975   /* backward solve the upper triangular */
3976   idt = 4*(n-1);
3977   for (i=n-1; i>=0; i--){
3978     ai16 = 16*diag[i];
3979     v    = aa + ai16 + 16;
3980     vi   = aj + diag[i] + 1;
3981     nz   = ai[i+1] - diag[i] - 1;
3982     s1 = x[idt];  s2 = x[1+idt];
3983     s3 = x[2+idt];s4 = x[3+idt];
3984     while (nz--) {
3985       idx   = 4*(*vi++);
3986       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3987       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3988       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3989       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3990       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3991       v    += 16;
3992     }
3993     v        = aa + ai16;
3994     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3995     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3996     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3997     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3998     idt -= 4;
3999   }
4000   }
4001 #endif
4002 
4003   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4004   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4005   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4006   PetscFunctionReturn(0);
4007 }
4008 
4009 #undef __FUNCT__
4010 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4011 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4012 {
4013     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4014     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4015     PetscErrorCode    ierr;
4016     PetscInt          idx,jdx,idt;
4017     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4018     const MatScalar   *aa=a->a,*v;
4019     PetscScalar       *x;
4020     const PetscScalar *b;
4021     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4022 
4023     PetscFunctionBegin;
4024     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4025     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4026     /* forward solve the lower triangular */
4027     idx    = 0;
4028     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4029     for (i=1; i<n; i++) {
4030        v    = aa + bs2*ai[i];
4031        vi   = aj + ai[i];
4032        nz   = ai[i+1] - ai[i];
4033       idx   = bs*i;
4034        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4035       for(k=0;k<nz;k++) {
4036           jdx   = bs*vi[k];
4037           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4038           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4039           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4040           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4041 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4042 
4043           v   +=  bs2;
4044         }
4045 
4046        x[idx]   = s1;
4047        x[1+idx] = s2;
4048        x[2+idx] = s3;
4049        x[3+idx] = s4;
4050     }
4051 
4052    /* backward solve the upper triangular */
4053   for (i=n-1; i>=0; i--){
4054     v   = aa + bs2*(adiag[i+1]+1);
4055      vi  = aj + adiag[i+1]+1;
4056      nz  = adiag[i] - adiag[i+1]-1;
4057      idt = bs*i;
4058      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4059 
4060     for(k=0;k<nz;k++){
4061       idx   = bs*vi[k];
4062        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4063        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4064        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4065        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4066        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4067 
4068         v   +=  bs2;
4069     }
4070     /* x = inv_diagonal*x */
4071    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4072    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4073    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4074    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4075 
4076   }
4077 
4078   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4079   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4080   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4081   PetscFunctionReturn(0);
4082 }
4083 
4084 #undef __FUNCT__
4085 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4086 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4087 {
4088   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4089   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4090   PetscErrorCode ierr;
4091   PetscInt       *diag = a->diag;
4092   MatScalar      *aa=a->a;
4093   PetscScalar    *x,*b;
4094 
4095   PetscFunctionBegin;
4096   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4097   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4098 
4099   {
4100     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
4101     MatScalar  *v,*t=(MatScalar *)x;
4102     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
4103 
4104     /* forward solve the lower triangular */
4105     idx  = 0;
4106     t[0] = (MatScalar)b[0];
4107     t[1] = (MatScalar)b[1];
4108     t[2] = (MatScalar)b[2];
4109     t[3] = (MatScalar)b[3];
4110     for (i=1; i<n; i++) {
4111       v     =  aa      + 16*ai[i];
4112       vi    =  aj      + ai[i];
4113       nz    =  diag[i] - ai[i];
4114       idx   +=  4;
4115       s1 = (MatScalar)b[idx];
4116       s2 = (MatScalar)b[1+idx];
4117       s3 = (MatScalar)b[2+idx];
4118       s4 = (MatScalar)b[3+idx];
4119       while (nz--) {
4120         jdx = 4*(*vi++);
4121         x1  = t[jdx];
4122         x2  = t[1+jdx];
4123         x3  = t[2+jdx];
4124         x4  = t[3+jdx];
4125         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4126         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4127         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4128         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4129         v    += 16;
4130       }
4131       t[idx]   = s1;
4132       t[1+idx] = s2;
4133       t[2+idx] = s3;
4134       t[3+idx] = s4;
4135     }
4136     /* backward solve the upper triangular */
4137     idt = 4*(n-1);
4138     for (i=n-1; i>=0; i--){
4139       ai16 = 16*diag[i];
4140       v    = aa + ai16 + 16;
4141       vi   = aj + diag[i] + 1;
4142       nz   = ai[i+1] - diag[i] - 1;
4143       s1   = t[idt];
4144       s2   = t[1+idt];
4145       s3   = t[2+idt];
4146       s4   = t[3+idt];
4147       while (nz--) {
4148         idx = 4*(*vi++);
4149         x1  = (MatScalar)x[idx];
4150         x2  = (MatScalar)x[1+idx];
4151         x3  = (MatScalar)x[2+idx];
4152         x4  = (MatScalar)x[3+idx];
4153         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4154         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4155         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4156         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4157         v    += 16;
4158       }
4159       v        = aa + ai16;
4160       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4161       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4162       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4163       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4164       idt -= 4;
4165     }
4166   }
4167 
4168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4170   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4171   PetscFunctionReturn(0);
4172 }
4173 
4174 #if defined (PETSC_HAVE_SSE)
4175 
4176 #include PETSC_HAVE_SSE
4177 #undef __FUNCT__
4178 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4179 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4180 {
4181   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4182   unsigned short *aj=(unsigned short *)a->j;
4183   PetscErrorCode ierr;
4184   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4185   MatScalar      *aa=a->a;
4186   PetscScalar    *x,*b;
4187 
4188   PetscFunctionBegin;
4189   SSE_SCOPE_BEGIN;
4190   /*
4191      Note: This code currently uses demotion of double
4192      to float when performing the mixed-mode computation.
4193      This may not be numerically reasonable for all applications.
4194   */
4195   PREFETCH_NTA(aa+16*ai[1]);
4196 
4197   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4198   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4199   {
4200     /* x will first be computed in single precision then promoted inplace to double */
4201     MatScalar      *v,*t=(MatScalar *)x;
4202     int            nz,i,idt,ai16;
4203     unsigned int   jdx,idx;
4204     unsigned short *vi;
4205     /* Forward solve the lower triangular factor. */
4206 
4207     /* First block is the identity. */
4208     idx  = 0;
4209     CONVERT_DOUBLE4_FLOAT4(t,b);
4210     v    =  aa + 16*((unsigned int)ai[1]);
4211 
4212     for (i=1; i<n;) {
4213       PREFETCH_NTA(&v[8]);
4214       vi   =  aj      + ai[i];
4215       nz   =  diag[i] - ai[i];
4216       idx +=  4;
4217 
4218       /* Demote RHS from double to float. */
4219       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4220       LOAD_PS(&t[idx],XMM7);
4221 
4222       while (nz--) {
4223         PREFETCH_NTA(&v[16]);
4224         jdx = 4*((unsigned int)(*vi++));
4225 
4226         /* 4x4 Matrix-Vector product with negative accumulation: */
4227         SSE_INLINE_BEGIN_2(&t[jdx],v)
4228           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4229 
4230           /* First Column */
4231           SSE_COPY_PS(XMM0,XMM6)
4232           SSE_SHUFFLE(XMM0,XMM0,0x00)
4233           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4234           SSE_SUB_PS(XMM7,XMM0)
4235 
4236           /* Second Column */
4237           SSE_COPY_PS(XMM1,XMM6)
4238           SSE_SHUFFLE(XMM1,XMM1,0x55)
4239           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4240           SSE_SUB_PS(XMM7,XMM1)
4241 
4242           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4243 
4244           /* Third Column */
4245           SSE_COPY_PS(XMM2,XMM6)
4246           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4247           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4248           SSE_SUB_PS(XMM7,XMM2)
4249 
4250           /* Fourth Column */
4251           SSE_COPY_PS(XMM3,XMM6)
4252           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4253           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4254           SSE_SUB_PS(XMM7,XMM3)
4255         SSE_INLINE_END_2
4256 
4257         v  += 16;
4258       }
4259       v    =  aa + 16*ai[++i];
4260       PREFETCH_NTA(v);
4261       STORE_PS(&t[idx],XMM7);
4262     }
4263 
4264     /* Backward solve the upper triangular factor.*/
4265 
4266     idt  = 4*(n-1);
4267     ai16 = 16*diag[n-1];
4268     v    = aa + ai16 + 16;
4269     for (i=n-1; i>=0;){
4270       PREFETCH_NTA(&v[8]);
4271       vi = aj + diag[i] + 1;
4272       nz = ai[i+1] - diag[i] - 1;
4273 
4274       LOAD_PS(&t[idt],XMM7);
4275 
4276       while (nz--) {
4277         PREFETCH_NTA(&v[16]);
4278         idx = 4*((unsigned int)(*vi++));
4279 
4280         /* 4x4 Matrix-Vector Product with negative accumulation: */
4281         SSE_INLINE_BEGIN_2(&t[idx],v)
4282           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4283 
4284           /* First Column */
4285           SSE_COPY_PS(XMM0,XMM6)
4286           SSE_SHUFFLE(XMM0,XMM0,0x00)
4287           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4288           SSE_SUB_PS(XMM7,XMM0)
4289 
4290           /* Second Column */
4291           SSE_COPY_PS(XMM1,XMM6)
4292           SSE_SHUFFLE(XMM1,XMM1,0x55)
4293           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4294           SSE_SUB_PS(XMM7,XMM1)
4295 
4296           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4297 
4298           /* Third Column */
4299           SSE_COPY_PS(XMM2,XMM6)
4300           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4301           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4302           SSE_SUB_PS(XMM7,XMM2)
4303 
4304           /* Fourth Column */
4305           SSE_COPY_PS(XMM3,XMM6)
4306           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4307           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4308           SSE_SUB_PS(XMM7,XMM3)
4309         SSE_INLINE_END_2
4310         v  += 16;
4311       }
4312       v    = aa + ai16;
4313       ai16 = 16*diag[--i];
4314       PREFETCH_NTA(aa+ai16+16);
4315       /*
4316          Scale the result by the diagonal 4x4 block,
4317          which was inverted as part of the factorization
4318       */
4319       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4320         /* First Column */
4321         SSE_COPY_PS(XMM0,XMM7)
4322         SSE_SHUFFLE(XMM0,XMM0,0x00)
4323         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4324 
4325         /* Second Column */
4326         SSE_COPY_PS(XMM1,XMM7)
4327         SSE_SHUFFLE(XMM1,XMM1,0x55)
4328         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4329         SSE_ADD_PS(XMM0,XMM1)
4330 
4331         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4332 
4333         /* Third Column */
4334         SSE_COPY_PS(XMM2,XMM7)
4335         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4336         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4337         SSE_ADD_PS(XMM0,XMM2)
4338 
4339         /* Fourth Column */
4340         SSE_COPY_PS(XMM3,XMM7)
4341         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4342         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4343         SSE_ADD_PS(XMM0,XMM3)
4344 
4345         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4346       SSE_INLINE_END_3
4347 
4348       v    = aa + ai16 + 16;
4349       idt -= 4;
4350     }
4351 
4352     /* Convert t from single precision back to double precision (inplace)*/
4353     idt = 4*(n-1);
4354     for (i=n-1;i>=0;i--) {
4355       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4356       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4357       PetscScalar *xtemp=&x[idt];
4358       MatScalar   *ttemp=&t[idt];
4359       xtemp[3] = (PetscScalar)ttemp[3];
4360       xtemp[2] = (PetscScalar)ttemp[2];
4361       xtemp[1] = (PetscScalar)ttemp[1];
4362       xtemp[0] = (PetscScalar)ttemp[0];
4363       idt -= 4;
4364     }
4365 
4366   } /* End of artificial scope. */
4367   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4369   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4370   SSE_SCOPE_END;
4371   PetscFunctionReturn(0);
4372 }
4373 
4374 #undef __FUNCT__
4375 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4376 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4377 {
4378   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4379   int            *aj=a->j;
4380   PetscErrorCode ierr;
4381   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4382   MatScalar      *aa=a->a;
4383   PetscScalar    *x,*b;
4384 
4385   PetscFunctionBegin;
4386   SSE_SCOPE_BEGIN;
4387   /*
4388      Note: This code currently uses demotion of double
4389      to float when performing the mixed-mode computation.
4390      This may not be numerically reasonable for all applications.
4391   */
4392   PREFETCH_NTA(aa+16*ai[1]);
4393 
4394   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4395   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4396   {
4397     /* x will first be computed in single precision then promoted inplace to double */
4398     MatScalar *v,*t=(MatScalar *)x;
4399     int       nz,i,idt,ai16;
4400     int       jdx,idx;
4401     int       *vi;
4402     /* Forward solve the lower triangular factor. */
4403 
4404     /* First block is the identity. */
4405     idx  = 0;
4406     CONVERT_DOUBLE4_FLOAT4(t,b);
4407     v    =  aa + 16*ai[1];
4408 
4409     for (i=1; i<n;) {
4410       PREFETCH_NTA(&v[8]);
4411       vi   =  aj      + ai[i];
4412       nz   =  diag[i] - ai[i];
4413       idx +=  4;
4414 
4415       /* Demote RHS from double to float. */
4416       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4417       LOAD_PS(&t[idx],XMM7);
4418 
4419       while (nz--) {
4420         PREFETCH_NTA(&v[16]);
4421         jdx = 4*(*vi++);
4422 /*          jdx = *vi++; */
4423 
4424         /* 4x4 Matrix-Vector product with negative accumulation: */
4425         SSE_INLINE_BEGIN_2(&t[jdx],v)
4426           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4427 
4428           /* First Column */
4429           SSE_COPY_PS(XMM0,XMM6)
4430           SSE_SHUFFLE(XMM0,XMM0,0x00)
4431           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4432           SSE_SUB_PS(XMM7,XMM0)
4433 
4434           /* Second Column */
4435           SSE_COPY_PS(XMM1,XMM6)
4436           SSE_SHUFFLE(XMM1,XMM1,0x55)
4437           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4438           SSE_SUB_PS(XMM7,XMM1)
4439 
4440           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4441 
4442           /* Third Column */
4443           SSE_COPY_PS(XMM2,XMM6)
4444           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4445           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4446           SSE_SUB_PS(XMM7,XMM2)
4447 
4448           /* Fourth Column */
4449           SSE_COPY_PS(XMM3,XMM6)
4450           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4451           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4452           SSE_SUB_PS(XMM7,XMM3)
4453         SSE_INLINE_END_2
4454 
4455         v  += 16;
4456       }
4457       v    =  aa + 16*ai[++i];
4458       PREFETCH_NTA(v);
4459       STORE_PS(&t[idx],XMM7);
4460     }
4461 
4462     /* Backward solve the upper triangular factor.*/
4463 
4464     idt  = 4*(n-1);
4465     ai16 = 16*diag[n-1];
4466     v    = aa + ai16 + 16;
4467     for (i=n-1; i>=0;){
4468       PREFETCH_NTA(&v[8]);
4469       vi = aj + diag[i] + 1;
4470       nz = ai[i+1] - diag[i] - 1;
4471 
4472       LOAD_PS(&t[idt],XMM7);
4473 
4474       while (nz--) {
4475         PREFETCH_NTA(&v[16]);
4476         idx = 4*(*vi++);
4477 /*          idx = *vi++; */
4478 
4479         /* 4x4 Matrix-Vector Product with negative accumulation: */
4480         SSE_INLINE_BEGIN_2(&t[idx],v)
4481           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4482 
4483           /* First Column */
4484           SSE_COPY_PS(XMM0,XMM6)
4485           SSE_SHUFFLE(XMM0,XMM0,0x00)
4486           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4487           SSE_SUB_PS(XMM7,XMM0)
4488 
4489           /* Second Column */
4490           SSE_COPY_PS(XMM1,XMM6)
4491           SSE_SHUFFLE(XMM1,XMM1,0x55)
4492           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4493           SSE_SUB_PS(XMM7,XMM1)
4494 
4495           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4496 
4497           /* Third Column */
4498           SSE_COPY_PS(XMM2,XMM6)
4499           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4500           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4501           SSE_SUB_PS(XMM7,XMM2)
4502 
4503           /* Fourth Column */
4504           SSE_COPY_PS(XMM3,XMM6)
4505           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4506           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4507           SSE_SUB_PS(XMM7,XMM3)
4508         SSE_INLINE_END_2
4509         v  += 16;
4510       }
4511       v    = aa + ai16;
4512       ai16 = 16*diag[--i];
4513       PREFETCH_NTA(aa+ai16+16);
4514       /*
4515          Scale the result by the diagonal 4x4 block,
4516          which was inverted as part of the factorization
4517       */
4518       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4519         /* First Column */
4520         SSE_COPY_PS(XMM0,XMM7)
4521         SSE_SHUFFLE(XMM0,XMM0,0x00)
4522         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4523 
4524         /* Second Column */
4525         SSE_COPY_PS(XMM1,XMM7)
4526         SSE_SHUFFLE(XMM1,XMM1,0x55)
4527         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4528         SSE_ADD_PS(XMM0,XMM1)
4529 
4530         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4531 
4532         /* Third Column */
4533         SSE_COPY_PS(XMM2,XMM7)
4534         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4535         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4536         SSE_ADD_PS(XMM0,XMM2)
4537 
4538         /* Fourth Column */
4539         SSE_COPY_PS(XMM3,XMM7)
4540         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4541         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4542         SSE_ADD_PS(XMM0,XMM3)
4543 
4544         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4545       SSE_INLINE_END_3
4546 
4547       v    = aa + ai16 + 16;
4548       idt -= 4;
4549     }
4550 
4551     /* Convert t from single precision back to double precision (inplace)*/
4552     idt = 4*(n-1);
4553     for (i=n-1;i>=0;i--) {
4554       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4555       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4556       PetscScalar *xtemp=&x[idt];
4557       MatScalar   *ttemp=&t[idt];
4558       xtemp[3] = (PetscScalar)ttemp[3];
4559       xtemp[2] = (PetscScalar)ttemp[2];
4560       xtemp[1] = (PetscScalar)ttemp[1];
4561       xtemp[0] = (PetscScalar)ttemp[0];
4562       idt -= 4;
4563     }
4564 
4565   } /* End of artificial scope. */
4566   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4567   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4568   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4569   SSE_SCOPE_END;
4570   PetscFunctionReturn(0);
4571 }
4572 
4573 #endif
4574 
4575 #undef __FUNCT__
4576 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4577 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4578 {
4579   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4580   IS                iscol=a->col,isrow=a->row;
4581   PetscErrorCode    ierr;
4582   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4583   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4584   const MatScalar   *aa=a->a,*v;
4585   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4586   const PetscScalar *b;
4587 
4588   PetscFunctionBegin;
4589   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4590   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4591   t  = a->solve_work;
4592 
4593   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4594   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4595 
4596   /* forward solve the lower triangular */
4597   idx    = 3*(*r++);
4598   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4599   for (i=1; i<n; i++) {
4600     v     = aa + 9*ai[i];
4601     vi    = aj + ai[i];
4602     nz    = diag[i] - ai[i];
4603     idx   = 3*(*r++);
4604     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4605     while (nz--) {
4606       idx   = 3*(*vi++);
4607       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4608       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4609       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4610       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4611       v += 9;
4612     }
4613     idx = 3*i;
4614     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4615   }
4616   /* backward solve the upper triangular */
4617   for (i=n-1; i>=0; i--){
4618     v    = aa + 9*diag[i] + 9;
4619     vi   = aj + diag[i] + 1;
4620     nz   = ai[i+1] - diag[i] - 1;
4621     idt  = 3*i;
4622     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4623     while (nz--) {
4624       idx   = 3*(*vi++);
4625       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4626       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4627       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4628       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4629       v += 9;
4630     }
4631     idc = 3*(*c--);
4632     v   = aa + 9*diag[i];
4633     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4634     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4635     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4636   }
4637   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4638   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4639   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4640   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4641   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4642   PetscFunctionReturn(0);
4643 }
4644 
4645 #undef __FUNCT__
4646 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4647 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4648 {
4649   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4650   IS                iscol=a->col,isrow=a->row;
4651   PetscErrorCode    ierr;
4652   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4653   const PetscInt    *r,*c,*rout,*cout;
4654   const MatScalar   *aa=a->a,*v;
4655   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4656   const PetscScalar *b;
4657 
4658   PetscFunctionBegin;
4659   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4660   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4661   t  = a->solve_work;
4662 
4663   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4664   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4665 
4666   /* forward solve the lower triangular */
4667   idx    = 3*r[0];
4668   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4669   for (i=1; i<n; i++) {
4670     v     = aa + 9*ai[i];
4671     vi    = aj + ai[i];
4672     nz    = ai[i+1] - ai[i];
4673     idx   = 3*r[i];
4674     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4675     for(m=0;m<nz;m++){
4676       idx   = 3*vi[m];
4677       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4678       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4679       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4680       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4681       v += 9;
4682     }
4683     idx = 3*i;
4684     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4685   }
4686   /* backward solve the upper triangular */
4687   for (i=n-1; i>=0; i--){
4688     v    = aa + 9*(adiag[i+1]+1);
4689     vi   = aj + adiag[i+1]+1;
4690     nz   = adiag[i] - adiag[i+1] - 1;
4691     idt  = 3*i;
4692     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4693     for(m=0;m<nz;m++){
4694       idx   = 3*vi[m];
4695       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4696       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4697       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4698       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4699       v += 9;
4700     }
4701     idc = 3*c[i];
4702     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4703     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4704     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4705   }
4706   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4707   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4708   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4709   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4710   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4711   PetscFunctionReturn(0);
4712 }
4713 
4714 /*
4715       Special case where the matrix was ILU(0) factored in the natural
4716    ordering. This eliminates the need for the column and row permutation.
4717 */
4718 #undef __FUNCT__
4719 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4720 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4721 {
4722   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4723   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4724   PetscErrorCode    ierr;
4725   const PetscInt    *diag = a->diag,*vi;
4726   const MatScalar   *aa=a->a,*v;
4727   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4728   const PetscScalar *b;
4729   PetscInt          jdx,idt,idx,nz,i;
4730 
4731   PetscFunctionBegin;
4732   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4733   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4734 
4735   /* forward solve the lower triangular */
4736   idx    = 0;
4737   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4738   for (i=1; i<n; i++) {
4739     v     =  aa      + 9*ai[i];
4740     vi    =  aj      + ai[i];
4741     nz    =  diag[i] - ai[i];
4742     idx   +=  3;
4743     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4744     while (nz--) {
4745       jdx   = 3*(*vi++);
4746       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4747       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4748       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4749       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4750       v    += 9;
4751     }
4752     x[idx]   = s1;
4753     x[1+idx] = s2;
4754     x[2+idx] = s3;
4755   }
4756   /* backward solve the upper triangular */
4757   for (i=n-1; i>=0; i--){
4758     v    = aa + 9*diag[i] + 9;
4759     vi   = aj + diag[i] + 1;
4760     nz   = ai[i+1] - diag[i] - 1;
4761     idt  = 3*i;
4762     s1 = x[idt];  s2 = x[1+idt];
4763     s3 = x[2+idt];
4764     while (nz--) {
4765       idx   = 3*(*vi++);
4766       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4767       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4768       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4769       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4770       v    += 9;
4771     }
4772     v        = aa +  9*diag[i];
4773     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4774     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4775     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4776   }
4777 
4778   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4779   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4780   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4781   PetscFunctionReturn(0);
4782 }
4783 
4784 #undef __FUNCT__
4785 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4786 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4787 {
4788     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4789     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4790     PetscErrorCode    ierr;
4791     PetscInt          idx,jdx,idt;
4792     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4793     const MatScalar   *aa=a->a,*v;
4794     PetscScalar       *x;
4795     const PetscScalar *b;
4796     PetscScalar        s1,s2,s3,x1,x2,x3;
4797 
4798     PetscFunctionBegin;
4799     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4800     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4801     /* forward solve the lower triangular */
4802     idx    = 0;
4803     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4804     for (i=1; i<n; i++) {
4805        v    = aa + bs2*ai[i];
4806        vi   = aj + ai[i];
4807        nz   = ai[i+1] - ai[i];
4808       idx   = bs*i;
4809        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4810       for(k=0;k<nz;k++){
4811          jdx   = bs*vi[k];
4812           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4813           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4814           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4815           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4816 
4817           v   +=  bs2;
4818         }
4819 
4820        x[idx]   = s1;
4821        x[1+idx] = s2;
4822        x[2+idx] = s3;
4823     }
4824 
4825    /* backward solve the upper triangular */
4826   for (i=n-1; i>=0; i--){
4827     v   = aa + bs2*(adiag[i+1]+1);
4828      vi  = aj + adiag[i+1]+1;
4829      nz  = adiag[i] - adiag[i+1]-1;
4830      idt = bs*i;
4831      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4832 
4833      for(k=0;k<nz;k++){
4834        idx   = bs*vi[k];
4835        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4836        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4837        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4838        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4839 
4840         v   +=  bs2;
4841     }
4842     /* x = inv_diagonal*x */
4843    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4844    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4845    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4846 
4847   }
4848 
4849   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4850   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4851   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4852   PetscFunctionReturn(0);
4853 }
4854 
4855 #undef __FUNCT__
4856 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
4857 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
4858 {
4859   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4860   IS                iscol=a->col,isrow=a->row;
4861   PetscErrorCode    ierr;
4862   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4863   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4864   const MatScalar   *aa=a->a,*v;
4865   PetscScalar       *x,s1,s2,x1,x2,*t;
4866   const PetscScalar *b;
4867 
4868   PetscFunctionBegin;
4869   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4870   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4871   t  = a->solve_work;
4872 
4873   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4874   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4875 
4876   /* forward solve the lower triangular */
4877   idx    = 2*(*r++);
4878   t[0] = b[idx]; t[1] = b[1+idx];
4879   for (i=1; i<n; i++) {
4880     v     = aa + 4*ai[i];
4881     vi    = aj + ai[i];
4882     nz    = diag[i] - ai[i];
4883     idx   = 2*(*r++);
4884     s1  = b[idx]; s2 = b[1+idx];
4885     while (nz--) {
4886       idx   = 2*(*vi++);
4887       x1    = t[idx]; x2 = t[1+idx];
4888       s1 -= v[0]*x1 + v[2]*x2;
4889       s2 -= v[1]*x1 + v[3]*x2;
4890       v += 4;
4891     }
4892     idx = 2*i;
4893     t[idx] = s1; t[1+idx] = s2;
4894   }
4895   /* backward solve the upper triangular */
4896   for (i=n-1; i>=0; i--){
4897     v    = aa + 4*diag[i] + 4;
4898     vi   = aj + diag[i] + 1;
4899     nz   = ai[i+1] - diag[i] - 1;
4900     idt  = 2*i;
4901     s1 = t[idt]; s2 = t[1+idt];
4902     while (nz--) {
4903       idx   = 2*(*vi++);
4904       x1    = t[idx]; x2 = t[1+idx];
4905       s1 -= v[0]*x1 + v[2]*x2;
4906       s2 -= v[1]*x1 + v[3]*x2;
4907       v += 4;
4908     }
4909     idc = 2*(*c--);
4910     v   = aa + 4*diag[i];
4911     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4912     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4913   }
4914   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4915   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4916   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4917   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4918   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4919   PetscFunctionReturn(0);
4920 }
4921 
4922 #undef __FUNCT__
4923 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4924 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4925 {
4926   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4927   IS                iscol=a->col,isrow=a->row;
4928   PetscErrorCode    ierr;
4929   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4930   const PetscInt    *r,*c,*rout,*cout;
4931   const MatScalar   *aa=a->a,*v;
4932   PetscScalar       *x,s1,s2,x1,x2,*t;
4933   const PetscScalar *b;
4934 
4935   PetscFunctionBegin;
4936   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4937   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4938   t  = a->solve_work;
4939 
4940   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4941   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4942 
4943   /* forward solve the lower triangular */
4944   idx    = 2*r[0];
4945   t[0] = b[idx]; t[1] = b[1+idx];
4946   for (i=1; i<n; i++) {
4947     v     = aa + 4*ai[i];
4948     vi    = aj + ai[i];
4949     nz    = ai[i+1] - ai[i];
4950     idx   = 2*r[i];
4951     s1  = b[idx]; s2 = b[1+idx];
4952     for(m=0;m<nz;m++){
4953       jdx   = 2*vi[m];
4954       x1    = t[jdx]; x2 = t[1+jdx];
4955       s1 -= v[0]*x1 + v[2]*x2;
4956       s2 -= v[1]*x1 + v[3]*x2;
4957       v += 4;
4958     }
4959     idx = 2*i;
4960     t[idx] = s1; t[1+idx] = s2;
4961   }
4962   /* backward solve the upper triangular */
4963   for (i=n-1; i>=0; i--){
4964     v    = aa + 4*(adiag[i+1]+1);
4965     vi   = aj + adiag[i+1]+1;
4966     nz   = adiag[i] - adiag[i+1] - 1;
4967     idt  = 2*i;
4968     s1 = t[idt]; s2 = t[1+idt];
4969     for(m=0;m<nz;m++){
4970       idx   = 2*vi[m];
4971       x1    = t[idx]; x2 = t[1+idx];
4972       s1 -= v[0]*x1 + v[2]*x2;
4973       s2 -= v[1]*x1 + v[3]*x2;
4974       v += 4;
4975     }
4976     idc = 2*c[i];
4977     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4978     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4979   }
4980   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4981   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4982   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4983   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4984   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4985   PetscFunctionReturn(0);
4986 }
4987 
4988 /*
4989       Special case where the matrix was ILU(0) factored in the natural
4990    ordering. This eliminates the need for the column and row permutation.
4991 */
4992 #undef __FUNCT__
4993 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
4994 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4995 {
4996   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4997   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4998   PetscErrorCode    ierr;
4999   PetscInt          *diag = a->diag;
5000   const MatScalar   *aa=a->a,*v;
5001   PetscScalar       *x,s1,s2,x1,x2;
5002   const PetscScalar *b;
5003   PetscInt          jdx,idt,idx,nz,*vi,i;
5004 
5005   PetscFunctionBegin;
5006   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5007   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5008 
5009   /* forward solve the lower triangular */
5010   idx    = 0;
5011   x[0]   = b[0]; x[1] = b[1];
5012   for (i=1; i<n; i++) {
5013     v     =  aa      + 4*ai[i];
5014     vi    =  aj      + ai[i];
5015     nz    =  diag[i] - ai[i];
5016     idx   +=  2;
5017     s1  =  b[idx];s2 = b[1+idx];
5018     while (nz--) {
5019       jdx   = 2*(*vi++);
5020       x1    = x[jdx];x2 = x[1+jdx];
5021       s1 -= v[0]*x1 + v[2]*x2;
5022       s2 -= v[1]*x1 + v[3]*x2;
5023       v    += 4;
5024     }
5025     x[idx]   = s1;
5026     x[1+idx] = s2;
5027   }
5028   /* backward solve the upper triangular */
5029   for (i=n-1; i>=0; i--){
5030     v    = aa + 4*diag[i] + 4;
5031     vi   = aj + diag[i] + 1;
5032     nz   = ai[i+1] - diag[i] - 1;
5033     idt  = 2*i;
5034     s1 = x[idt];  s2 = x[1+idt];
5035     while (nz--) {
5036       idx   = 2*(*vi++);
5037       x1    = x[idx];   x2 = x[1+idx];
5038       s1 -= v[0]*x1 + v[2]*x2;
5039       s2 -= v[1]*x1 + v[3]*x2;
5040       v    += 4;
5041     }
5042     v        = aa +  4*diag[i];
5043     x[idt]   = v[0]*s1 + v[2]*s2;
5044     x[1+idt] = v[1]*s1 + v[3]*s2;
5045   }
5046 
5047   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5048   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5049   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5050   PetscFunctionReturn(0);
5051 }
5052 
5053 #undef __FUNCT__
5054 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5055 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5056 {
5057     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5058     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
5059     PetscErrorCode    ierr;
5060     PetscInt          jdx;
5061     const MatScalar   *aa=a->a,*v;
5062     PetscScalar       *x,s1,s2,x1,x2;
5063     const PetscScalar *b;
5064 
5065     PetscFunctionBegin;
5066     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5067     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5068     /* forward solve the lower triangular */
5069     idx    = 0;
5070     x[0] = b[idx]; x[1] = b[1+idx];
5071     for (i=1; i<n; i++) {
5072         v   = aa + 4*ai[i];
5073        vi   = aj + ai[i];
5074        nz   = ai[i+1] - ai[i];
5075        idx  = 2*i;
5076        s1   = b[idx];s2 = b[1+idx];
5077       for(k=0;k<nz;k++){
5078          jdx   = 2*vi[k];
5079           x1    = x[jdx];x2 = x[1+jdx];
5080           s1   -= v[0]*x1 + v[2]*x2;
5081           s2   -= v[1]*x1 + v[3]*x2;
5082            v   +=  4;
5083         }
5084        x[idx]   = s1;
5085        x[1+idx] = s2;
5086     }
5087 
5088    /* backward solve the upper triangular */
5089   for (i=n-1; i>=0; i--){
5090      v   = aa + 4*(adiag[i+1]+1);
5091      vi  = aj + adiag[i+1]+1;
5092      nz  = adiag[i] - adiag[i+1]-1;
5093      idt = 2*i;
5094      s1 = x[idt];  s2 = x[1+idt];
5095      for(k=0;k<nz;k++){
5096       idx   = 2*vi[k];
5097        x1    = x[idx];   x2 = x[1+idx];
5098        s1 -= v[0]*x1 + v[2]*x2;
5099        s2 -= v[1]*x1 + v[3]*x2;
5100          v    += 4;
5101     }
5102     /* x = inv_diagonal*x */
5103    x[idt]   = v[0]*s1 + v[2]*s2;
5104    x[1+idt] = v[1]*s1 + v[3]*s2;
5105   }
5106 
5107   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5109   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5110   PetscFunctionReturn(0);
5111 }
5112 
5113 #undef __FUNCT__
5114 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5115 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5116 {
5117   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5118   IS             iscol=a->col,isrow=a->row;
5119   PetscErrorCode ierr;
5120   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
5121   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
5122   MatScalar      *aa=a->a,*v;
5123   PetscScalar    *x,*b,s1,*t;
5124 
5125   PetscFunctionBegin;
5126   if (!n) PetscFunctionReturn(0);
5127 
5128   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5130   t  = a->solve_work;
5131 
5132   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5133   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5134 
5135   /* forward solve the lower triangular */
5136   t[0] = b[*r++];
5137   for (i=1; i<n; i++) {
5138     v     = aa + ai[i];
5139     vi    = aj + ai[i];
5140     nz    = diag[i] - ai[i];
5141     s1  = b[*r++];
5142     while (nz--) {
5143       s1 -= (*v++)*t[*vi++];
5144     }
5145     t[i] = s1;
5146   }
5147   /* backward solve the upper triangular */
5148   for (i=n-1; i>=0; i--){
5149     v    = aa + diag[i] + 1;
5150     vi   = aj + diag[i] + 1;
5151     nz   = ai[i+1] - diag[i] - 1;
5152     s1 = t[i];
5153     while (nz--) {
5154       s1 -= (*v++)*t[*vi++];
5155     }
5156     x[*c--] = t[i] = aa[diag[i]]*s1;
5157   }
5158 
5159   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5160   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5161   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5162   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5163   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5164   PetscFunctionReturn(0);
5165 }
5166 /*
5167       Special case where the matrix was ILU(0) factored in the natural
5168    ordering. This eliminates the need for the column and row permutation.
5169 */
5170 #undef __FUNCT__
5171 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5172 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5173 {
5174   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5175   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
5176   PetscErrorCode ierr;
5177   PetscInt       *diag = a->diag;
5178   MatScalar      *aa=a->a;
5179   PetscScalar    *x,*b;
5180   PetscScalar    s1,x1;
5181   MatScalar      *v;
5182   PetscInt       jdx,idt,idx,nz,*vi,i;
5183 
5184   PetscFunctionBegin;
5185   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5186   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5187 
5188   /* forward solve the lower triangular */
5189   idx    = 0;
5190   x[0]   = b[0];
5191   for (i=1; i<n; i++) {
5192     v     =  aa      + ai[i];
5193     vi    =  aj      + ai[i];
5194     nz    =  diag[i] - ai[i];
5195     idx   +=  1;
5196     s1  =  b[idx];
5197     while (nz--) {
5198       jdx   = *vi++;
5199       x1    = x[jdx];
5200       s1 -= v[0]*x1;
5201       v    += 1;
5202     }
5203     x[idx]   = s1;
5204   }
5205   /* backward solve the upper triangular */
5206   for (i=n-1; i>=0; i--){
5207     v    = aa + diag[i] + 1;
5208     vi   = aj + diag[i] + 1;
5209     nz   = ai[i+1] - diag[i] - 1;
5210     idt  = i;
5211     s1 = x[idt];
5212     while (nz--) {
5213       idx   = *vi++;
5214       x1    = x[idx];
5215       s1 -= v[0]*x1;
5216       v    += 1;
5217     }
5218     v        = aa +  diag[i];
5219     x[idt]   = v[0]*s1;
5220   }
5221   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5222   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5223   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5224   PetscFunctionReturn(0);
5225 }
5226 
5227 /* ----------------------------------------------------------------*/
5228 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5229 
5230 #undef __FUNCT__
5231 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5232 /*
5233    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5234 */
5235 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5236 {
5237   Mat             C=B;
5238   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5239   PetscErrorCode  ierr;
5240   PetscInt        i,j,k,ipvt[15];
5241   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5242   PetscInt        nz,nzL,row;
5243   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5244   const MatScalar *v,*aa=a->a;
5245   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5246 
5247   PetscFunctionBegin;
5248 
5249   /* generate work space needed by the factorization */
5250   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5251   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5252 
5253   for (i=0; i<n; i++){
5254     /* zero rtmp */
5255     /* L part */
5256     nz    = bi[i+1] - bi[i];
5257     bjtmp = bj + bi[i];
5258     for  (j=0; j<nz; j++){
5259       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5260     }
5261 
5262     /* U part */
5263     nz = bdiag[i] - bdiag[i+1];
5264     bjtmp = bj + bdiag[i+1]+1;
5265     for  (j=0; j<nz; j++){
5266       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5267     }
5268 
5269     /* load in initial (unfactored row) */
5270     nz    = ai[i+1] - ai[i];
5271     ajtmp = aj + ai[i];
5272     v     = aa + bs2*ai[i];
5273     for (j=0; j<nz; j++) {
5274       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5275     }
5276 
5277     /* elimination */
5278     bjtmp = bj + bi[i];
5279     nzL   = bi[i+1] - bi[i];
5280     for(k=0;k < nzL;k++) {
5281       row = bjtmp[k];
5282       pc = rtmp + bs2*row;
5283       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5284       if (flg) {
5285         pv = b->a + bs2*bdiag[row];
5286 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5287 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
5288 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5289         pv = b->a + bs2*(bdiag[row+1]+1);
5290         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5291         for (j=0; j<nz; j++) {
5292           vv   = rtmp + bs2*pj[j];
5293           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5294 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
5295 	  pv  += bs2;
5296         }
5297         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5298       }
5299     }
5300 
5301     /* finished row so stick it into b->a */
5302     /* L part */
5303     pv   = b->a + bs2*bi[i] ;
5304     pj   = b->j + bi[i] ;
5305     nz   = bi[i+1] - bi[i];
5306     for (j=0; j<nz; j++) {
5307       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5308     }
5309 
5310     /* Mark diagonal and invert diagonal for simplier triangular solves */
5311     pv   = b->a + bs2*bdiag[i];
5312     pj   = b->j + bdiag[i];
5313     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5314     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5315     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftinblocks);CHKERRQ(ierr);
5316 
5317     /* U part */
5318     pv = b->a + bs2*(bdiag[i+1]+1);
5319     pj = b->j + bdiag[i+1]+1;
5320     nz = bdiag[i] - bdiag[i+1] - 1;
5321     for (j=0; j<nz; j++){
5322       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5323     }
5324   }
5325 
5326   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5327   C->ops->solve          = MatSolve_SeqBAIJ_15_NaturalOrdering;
5328   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5329   C->assembled = PETSC_TRUE;
5330   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5331   PetscFunctionReturn(0);
5332 }
5333 
5334 #undef __FUNCT__
5335 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5336 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5337 {
5338   Mat            C=B;
5339   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5340   IS             isrow = b->row,isicol = b->icol;
5341   PetscErrorCode ierr;
5342   const PetscInt *r,*ic,*ics;
5343   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5344   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5345   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5346   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5347   MatScalar      *v_work;
5348   PetscTruth     col_identity,row_identity,both_identity;
5349 
5350   PetscFunctionBegin;
5351   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5352   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5353 
5354   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5355   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5356   ics  = ic;
5357 
5358   /* generate work space needed by dense LU factorization */
5359   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5360 
5361   for (i=0; i<n; i++){
5362     /* zero rtmp */
5363     /* L part */
5364     nz    = bi[i+1] - bi[i];
5365     bjtmp = bj + bi[i];
5366     for  (j=0; j<nz; j++){
5367       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5368     }
5369 
5370     /* U part */
5371     nz = bdiag[i] - bdiag[i+1];
5372     bjtmp = bj + bdiag[i+1]+1;
5373     for  (j=0; j<nz; j++){
5374       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5375     }
5376 
5377     /* load in initial (unfactored row) */
5378     nz    = ai[r[i]+1] - ai[r[i]];
5379     ajtmp = aj + ai[r[i]];
5380     v     = aa + bs2*ai[r[i]];
5381     for (j=0; j<nz; j++) {
5382       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5383     }
5384 
5385     /* elimination */
5386     bjtmp = bj + bi[i];
5387     nzL   = bi[i+1] - bi[i];
5388     for(k=0;k < nzL;k++) {
5389       row = bjtmp[k];
5390       pc = rtmp + bs2*row;
5391       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5392       if (flg) {
5393         pv         = b->a + bs2*bdiag[row];
5394         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5395         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5396         pv         = b->a + bs2*(bdiag[row+1]+1);
5397         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5398         for (j=0; j<nz; j++) {
5399           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5400         }
5401         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5402       }
5403     }
5404 
5405     /* finished row so stick it into b->a */
5406     /* L part */
5407     pv   = b->a + bs2*bi[i] ;
5408     pj   = b->j + bi[i] ;
5409     nz   = bi[i+1] - bi[i];
5410     for (j=0; j<nz; j++) {
5411       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5412     }
5413 
5414     /* Mark diagonal and invert diagonal for simplier triangular solves */
5415     pv  = b->a + bs2*bdiag[i];
5416     pj  = b->j + bdiag[i];
5417     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5418     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5419     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5420 
5421     /* U part */
5422     pv = b->a + bs2*(bdiag[i+1]+1);
5423     pj = b->j + bdiag[i+1]+1;
5424     nz = bdiag[i] - bdiag[i+1] - 1;
5425     for (j=0; j<nz; j++){
5426       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5427     }
5428   }
5429 
5430   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5431   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5432   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5433   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5434 
5435   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5436   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5437   both_identity = (PetscTruth) (row_identity && col_identity);
5438   if (both_identity){
5439     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5440   } else {
5441     C->ops->solve = MatSolve_SeqBAIJ_N;
5442   }
5443   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5444 
5445   C->assembled = PETSC_TRUE;
5446   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5447   PetscFunctionReturn(0);
5448 }
5449 
5450 /*
5451    ilu(0) with natural ordering under new data structure.
5452    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5453    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5454 */
5455 
5456 #undef __FUNCT__
5457 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5458 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5459 {
5460 
5461   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5462   PetscErrorCode     ierr;
5463   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5464   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5465 
5466   PetscFunctionBegin;
5467   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5468   b    = (Mat_SeqBAIJ*)(fact)->data;
5469 
5470   /* allocate matrix arrays for new data structure */
5471   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5472   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5473   b->singlemalloc = PETSC_TRUE;
5474   if (!b->diag){
5475     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5476     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5477   }
5478   bdiag = b->diag;
5479 
5480   if (n > 0) {
5481     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5482   }
5483 
5484   /* set bi and bj with new data structure */
5485   bi = b->i;
5486   bj = b->j;
5487 
5488   /* L part */
5489   bi[0] = 0;
5490   for (i=0; i<n; i++){
5491     nz = adiag[i] - ai[i];
5492     bi[i+1] = bi[i] + nz;
5493     aj = a->j + ai[i];
5494     for (j=0; j<nz; j++){
5495       *bj = aj[j]; bj++;
5496     }
5497   }
5498 
5499   /* U part */
5500   bi_temp = bi[n];
5501   bdiag[n] = bi[n]-1;
5502   for (i=n-1; i>=0; i--){
5503     nz = ai[i+1] - adiag[i] - 1;
5504     bi_temp = bi_temp + nz + 1;
5505     aj = a->j + adiag[i] + 1;
5506     for (j=0; j<nz; j++){
5507       *bj = aj[j]; bj++;
5508     }
5509     /* diag[i] */
5510     *bj = i; bj++;
5511     bdiag[i] = bi_temp - 1;
5512   }
5513   PetscFunctionReturn(0);
5514 }
5515 
5516 #undef __FUNCT__
5517 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5518 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5519 {
5520   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5521   IS                 isicol;
5522   PetscErrorCode     ierr;
5523   const PetscInt     *r,*ic;
5524   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5525   PetscInt           *bi,*cols,nnz,*cols_lvl;
5526   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5527   PetscInt           i,levels,diagonal_fill;
5528   PetscTruth         col_identity,row_identity,both_identity;
5529   PetscReal          f;
5530   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5531   PetscBT            lnkbt;
5532   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5533   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5534   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5535   PetscTruth         missing;
5536   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5537   PetscTruth         olddatastruct = PETSC_FALSE;
5538 
5539   PetscFunctionBegin;
5540   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_old",&olddatastruct,PETSC_NULL);CHKERRQ(ierr);
5541   if (olddatastruct){
5542     ierr = MatILUFactorSymbolic_SeqBAIJ_inplace(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5543     PetscFunctionReturn(0);
5544   }
5545   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5546   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5547   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5548 
5549   f             = info->fill;
5550   levels        = (PetscInt)info->levels;
5551   diagonal_fill = (PetscInt)info->diagonal_fill;
5552   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5553 
5554   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5555   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5556   both_identity = (PetscTruth) (row_identity && col_identity);
5557 
5558   if (!levels && both_identity) {
5559     /* special case: ilu(0) with natural ordering */
5560     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5561     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5562 
5563     fact->factor = MAT_FACTOR_ILU;
5564     (fact)->info.factor_mallocs    = 0;
5565     (fact)->info.fill_ratio_given  = info->fill;
5566     (fact)->info.fill_ratio_needed = 1.0;
5567     b                = (Mat_SeqBAIJ*)(fact)->data;
5568     b->row           = isrow;
5569     b->col           = iscol;
5570     b->icol          = isicol;
5571     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5572     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5573     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5574     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5575     PetscFunctionReturn(0);
5576   }
5577 
5578   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5579   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5580 
5581   /* get new row pointers */
5582   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5583   bi[0] = 0;
5584   /* bdiag is location of diagonal in factor */
5585   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5586   bdiag[0]  = 0;
5587 
5588   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5589 
5590   /* create a linked list for storing column indices of the active row */
5591   nlnk = n + 1;
5592   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5593 
5594   /* initial FreeSpace size is f*(ai[n]+1) */
5595   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5596   current_space = free_space;
5597   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5598   current_space_lvl = free_space_lvl;
5599 
5600   for (i=0; i<n; i++) {
5601     nzi = 0;
5602     /* copy current row into linked list */
5603     nnz  = ai[r[i]+1] - ai[r[i]];
5604     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5605     cols = aj + ai[r[i]];
5606     lnk[i] = -1; /* marker to indicate if diagonal exists */
5607     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5608     nzi += nlnk;
5609 
5610     /* make sure diagonal entry is included */
5611     if (diagonal_fill && lnk[i] == -1) {
5612       fm = n;
5613       while (lnk[fm] < i) fm = lnk[fm];
5614       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5615       lnk[fm]    = i;
5616       lnk_lvl[i] = 0;
5617       nzi++; dcount++;
5618     }
5619 
5620     /* add pivot rows into the active row */
5621     nzbd = 0;
5622     prow = lnk[n];
5623     while (prow < i) {
5624       nnz      = bdiag[prow];
5625       cols     = bj_ptr[prow] + nnz + 1;
5626       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5627       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5628       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5629       nzi += nlnk;
5630       prow = lnk[prow];
5631       nzbd++;
5632     }
5633     bdiag[i] = nzbd;
5634     bi[i+1]  = bi[i] + nzi;
5635 
5636     /* if free space is not available, make more free space */
5637     if (current_space->local_remaining<nzi) {
5638       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5639       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5640       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5641       reallocs++;
5642     }
5643 
5644     /* copy data into free_space and free_space_lvl, then initialize lnk */
5645     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5646     bj_ptr[i]    = current_space->array;
5647     bjlvl_ptr[i] = current_space_lvl->array;
5648 
5649     /* make sure the active row i has diagonal entry */
5650     if (*(bj_ptr[i]+bdiag[i]) != i) {
5651       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5652     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5653     }
5654 
5655     current_space->array           += nzi;
5656     current_space->local_used      += nzi;
5657     current_space->local_remaining -= nzi;
5658     current_space_lvl->array           += nzi;
5659     current_space_lvl->local_used      += nzi;
5660     current_space_lvl->local_remaining -= nzi;
5661   }
5662 
5663   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5664   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5665 
5666   /* destroy list of free space and other temporary arrays */
5667   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5668 
5669   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5670   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5671 
5672   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5673   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5674   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5675 
5676 #if defined(PETSC_USE_INFO)
5677   {
5678     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5679     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5680     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5681     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5682     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5683     if (diagonal_fill) {
5684       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5685     }
5686   }
5687 #endif
5688 
5689   /* put together the new matrix */
5690   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5691   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5692   b = (Mat_SeqBAIJ*)(fact)->data;
5693   b->free_a       = PETSC_TRUE;
5694   b->free_ij      = PETSC_TRUE;
5695   b->singlemalloc = PETSC_FALSE;
5696   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5697   b->j          = bj;
5698   b->i          = bi;
5699   b->diag       = bdiag;
5700   b->free_diag  = PETSC_TRUE;
5701   b->ilen       = 0;
5702   b->imax       = 0;
5703   b->row        = isrow;
5704   b->col        = iscol;
5705   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5706   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5707   b->icol       = isicol;
5708   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5709   /* In b structure:  Free imax, ilen, old a, old j.
5710      Allocate bdiag, solve_work, new a, new j */
5711   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5712   b->maxnz = b->nz = bdiag[0]+1;
5713   fact->info.factor_mallocs    = reallocs;
5714   fact->info.fill_ratio_given  = f;
5715   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5716   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5717   PetscFunctionReturn(0);
5718 }
5719 
5720 
5721 /*
5722      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5723    except that the data structure of Mat_SeqAIJ is slightly different.
5724    Not a good example of code reuse.
5725 */
5726 #undef __FUNCT__
5727 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
5728 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5729 {
5730   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5731   IS             isicol;
5732   PetscErrorCode ierr;
5733   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5734   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5735   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5736   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5737   PetscTruth     col_identity,row_identity,both_identity,flg;
5738   PetscReal      f;
5739 
5740   PetscFunctionBegin;
5741   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5742   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5743 
5744   f             = info->fill;
5745   levels        = (PetscInt)info->levels;
5746   diagonal_fill = (PetscInt)info->diagonal_fill;
5747   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5748 
5749   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5750   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5751   both_identity = (PetscTruth) (row_identity && col_identity);
5752 
5753   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5754     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5755     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5756 
5757     fact->factor = MAT_FACTOR_ILU;
5758     b            = (Mat_SeqBAIJ*)fact->data;
5759     b->row       = isrow;
5760     b->col       = iscol;
5761     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5762     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5763     b->icol      = isicol;
5764     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5765     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5766     PetscFunctionReturn(0);
5767   }
5768 
5769   /* general case perform the symbolic factorization */
5770     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5771     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5772 
5773     /* get new row pointers */
5774     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5775     ainew[0] = 0;
5776     /* don't know how many column pointers are needed so estimate */
5777     jmax = (PetscInt)(f*ai[n] + 1);
5778     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5779     /* ajfill is level of fill for each fill entry */
5780     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5781     /* fill is a linked list of nonzeros in active row */
5782     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5783     /* im is level for each filled value */
5784     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5785     /* dloc is location of diagonal in factor */
5786     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5787     dloc[0]  = 0;
5788     for (prow=0; prow<n; prow++) {
5789 
5790       /* copy prow into linked list */
5791       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5792       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5793       xi         = aj + ai[r[prow]];
5794       fill[n]    = n;
5795       fill[prow] = -1; /* marker for diagonal entry */
5796       while (nz--) {
5797 	fm  = n;
5798 	idx = ic[*xi++];
5799 	do {
5800 	  m  = fm;
5801 	  fm = fill[m];
5802 	} while (fm < idx);
5803 	fill[m]   = idx;
5804 	fill[idx] = fm;
5805 	im[idx]   = 0;
5806       }
5807 
5808       /* make sure diagonal entry is included */
5809       if (diagonal_fill && fill[prow] == -1) {
5810 	fm = n;
5811 	while (fill[fm] < prow) fm = fill[fm];
5812 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5813 	fill[fm]   = prow;
5814 	im[prow]   = 0;
5815 	nzf++;
5816 	dcount++;
5817       }
5818 
5819       nzi = 0;
5820       row = fill[n];
5821       while (row < prow) {
5822 	incrlev = im[row] + 1;
5823 	nz      = dloc[row];
5824 	xi      = ajnew  + ainew[row] + nz + 1;
5825 	flev    = ajfill + ainew[row] + nz + 1;
5826 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5827 	fm      = row;
5828 	while (nnz-- > 0) {
5829 	  idx = *xi++;
5830 	  if (*flev + incrlev > levels) {
5831 	    flev++;
5832 	    continue;
5833 	  }
5834 	  do {
5835 	    m  = fm;
5836 	    fm = fill[m];
5837 	  } while (fm < idx);
5838 	  if (fm != idx) {
5839 	    im[idx]   = *flev + incrlev;
5840 	    fill[m]   = idx;
5841 	    fill[idx] = fm;
5842 	    fm        = idx;
5843 	    nzf++;
5844 	  } else {
5845 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5846 	  }
5847 	  flev++;
5848 	}
5849 	row = fill[row];
5850 	nzi++;
5851       }
5852       /* copy new filled row into permanent storage */
5853       ainew[prow+1] = ainew[prow] + nzf;
5854       if (ainew[prow+1] > jmax) {
5855 
5856 	/* estimate how much additional space we will need */
5857 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5858 	/* just double the memory each time */
5859 	PetscInt maxadd = jmax;
5860 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5861 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5862 	jmax += maxadd;
5863 
5864 	/* allocate a longer ajnew and ajfill */
5865 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5866 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5867 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5868 	ajnew = xitmp;
5869 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5870 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5871 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5872 	ajfill = xitmp;
5873 	reallocate++; /* count how many reallocations are needed */
5874       }
5875       xitmp       = ajnew + ainew[prow];
5876       flev        = ajfill + ainew[prow];
5877       dloc[prow]  = nzi;
5878       fm          = fill[n];
5879       while (nzf--) {
5880 	*xitmp++ = fm;
5881 	*flev++ = im[fm];
5882 	fm      = fill[fm];
5883       }
5884       /* make sure row has diagonal entry */
5885       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5886 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5887     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5888       }
5889     }
5890     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5891     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5892     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5893     ierr = PetscFree(fill);CHKERRQ(ierr);
5894     ierr = PetscFree(im);CHKERRQ(ierr);
5895 
5896 #if defined(PETSC_USE_INFO)
5897     {
5898       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5899       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5900       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5901       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5902       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5903       if (diagonal_fill) {
5904 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5905       }
5906     }
5907 #endif
5908 
5909     /* put together the new matrix */
5910     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5911     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5912     b    = (Mat_SeqBAIJ*)fact->data;
5913     b->free_a       = PETSC_TRUE;
5914     b->free_ij      = PETSC_TRUE;
5915     b->singlemalloc = PETSC_FALSE;
5916     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5917     b->j          = ajnew;
5918     b->i          = ainew;
5919     for (i=0; i<n; i++) dloc[i] += ainew[i];
5920     b->diag       = dloc;
5921     b->free_diag  = PETSC_TRUE;
5922     b->ilen       = 0;
5923     b->imax       = 0;
5924     b->row        = isrow;
5925     b->col        = iscol;
5926     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5927     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5928     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5929     b->icol       = isicol;
5930     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5931     /* In b structure:  Free imax, ilen, old a, old j.
5932        Allocate dloc, solve_work, new a, new j */
5933     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5934     b->maxnz          = b->nz = ainew[n];
5935 
5936     fact->info.factor_mallocs    = reallocate;
5937     fact->info.fill_ratio_given  = f;
5938     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5939 
5940   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5941   PetscFunctionReturn(0);
5942 }
5943 
5944 #undef __FUNCT__
5945 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5946 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5947 {
5948   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5949   /* int i,*AJ=a->j,nz=a->nz; */
5950   PetscFunctionBegin;
5951   /* Undo Column scaling */
5952 /*    while (nz--) { */
5953 /*      AJ[i] = AJ[i]/4; */
5954 /*    } */
5955   /* This should really invoke a push/pop logic, but we don't have that yet. */
5956   A->ops->setunfactored = PETSC_NULL;
5957   PetscFunctionReturn(0);
5958 }
5959 
5960 #undef __FUNCT__
5961 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5962 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5963 {
5964   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5965   PetscInt       *AJ=a->j,nz=a->nz;
5966   unsigned short *aj=(unsigned short *)AJ;
5967   PetscFunctionBegin;
5968   /* Is this really necessary? */
5969   while (nz--) {
5970     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5971   }
5972   A->ops->setunfactored = PETSC_NULL;
5973   PetscFunctionReturn(0);
5974 }
5975 
5976 
5977