xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 35e7444da1e6edf29ca585d75dc3fe3d2c63a6e4)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 
7 #include "../src/mat/impls/baij/seq/baij.h"
8 #include "../src/mat/blockinvert.h"
9 #include "petscbt.h"
10 #include "../src/mat/utils/freespace.h"
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
15 {
16   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
17   PetscErrorCode ierr;
18   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
19   PetscInt       *diag = a->diag;
20   MatScalar      *aa=a->a,*v;
21   PetscScalar    s1,*x,*b;
22 
23   PetscFunctionBegin;
24   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
25   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
26   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27 
28   /* forward solve the U^T */
29   for (i=0; i<n; i++) {
30 
31     v     = aa + diag[i];
32     /* multiply by the inverse of the block diagonal */
33     s1    = (*v++)*x[i];
34     vi    = aj + diag[i] + 1;
35     nz    = ai[i+1] - diag[i] - 1;
36     while (nz--) {
37       x[*vi++]  -= (*v++)*s1;
38     }
39     x[i]   = s1;
40   }
41   /* backward solve the L^T */
42   for (i=n-1; i>=0; i--){
43     v    = aa + diag[i] - 1;
44     vi   = aj + diag[i] - 1;
45     nz   = diag[i] - ai[i];
46     s1   = x[i];
47     while (nz--) {
48       x[*vi--]   -=  (*v--)*s1;
49     }
50   }
51   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
52   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
53   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
54   PetscFunctionReturn(0);
55 }
56 
57 #undef __FUNCT__
58 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
59 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
60 {
61   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
62   PetscErrorCode ierr;
63   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
64   PetscInt       *diag = a->diag,oidx;
65   MatScalar      *aa=a->a,*v;
66   PetscScalar    s1,s2,x1,x2;
67   PetscScalar    *x,*b;
68 
69   PetscFunctionBegin;
70   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
71   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
72   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
73 
74   /* forward solve the U^T */
75   idx = 0;
76   for (i=0; i<n; i++) {
77 
78     v     = aa + 4*diag[i];
79     /* multiply by the inverse of the block diagonal */
80     x1 = x[idx];   x2 = x[1+idx];
81     s1 = v[0]*x1  +  v[1]*x2;
82     s2 = v[2]*x1  +  v[3]*x2;
83     v += 4;
84 
85     vi    = aj + diag[i] + 1;
86     nz    = ai[i+1] - diag[i] - 1;
87     while (nz--) {
88       oidx = 2*(*vi++);
89       x[oidx]   -= v[0]*s1  +  v[1]*s2;
90       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
91       v  += 4;
92     }
93     x[idx]   = s1;x[1+idx] = s2;
94     idx += 2;
95   }
96   /* backward solve the L^T */
97   for (i=n-1; i>=0; i--){
98     v    = aa + 4*diag[i] - 4;
99     vi   = aj + diag[i] - 1;
100     nz   = diag[i] - ai[i];
101     idt  = 2*i;
102     s1   = x[idt];  s2 = x[1+idt];
103     while (nz--) {
104       idx   = 2*(*vi--);
105       x[idx]   -=  v[0]*s1 +  v[1]*s2;
106       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
107       v -= 4;
108     }
109   }
110   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
111   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
112   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
113   PetscFunctionReturn(0);
114 }
115 
116 #undef __FUNCT__
117 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
118 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
119 {
120   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
121   PetscErrorCode ierr;
122   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
123   PetscInt       nz,idx,idt,j,i,oidx;
124   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
125   MatScalar      *aa=a->a,*v;
126   PetscScalar    s1,s2,x1,x2;
127   PetscScalar    *x,*b;
128 
129   PetscFunctionBegin;
130   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133 
134   /* forward solve the U^T */
135   idx = 0;
136   for (i=0; i<n; i++) {
137     v     = aa + bs2*diag[i];
138     /* multiply by the inverse of the block diagonal */
139     x1 = x[idx];   x2 = x[1+idx];
140     s1 = v[0]*x1  +  v[1]*x2;
141     s2 = v[2]*x1  +  v[3]*x2;
142     v -= bs2;
143 
144     vi    = aj + diag[i] - 1;
145     nz    = diag[i] - diag[i+1] - 1;
146     for(j=0;j>-nz;j--){
147       oidx = bs*vi[j];
148       x[oidx]   -= v[0]*s1  +  v[1]*s2;
149       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
150       v  -= bs2;
151     }
152     x[idx]   = s1;x[1+idx] = s2;
153     idx += bs;
154   }
155   /* backward solve the L^T */
156   for (i=n-1; i>=0; i--){
157     v    = aa + bs2*ai[i];
158     vi   = aj + ai[i];
159     nz   = ai[i+1] - ai[i];
160     idt  = bs*i;
161     s1   = x[idt];  s2 = x[1+idt];
162     for(j=0;j<nz;j++){
163       idx   = bs*vi[j];
164       x[idx]   -=  v[0]*s1 +  v[1]*s2;
165       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
166       v += bs2;
167     }
168   }
169   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
170   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
171   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
177 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
178 {
179   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
180   PetscErrorCode ierr;
181   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
182   PetscInt       *diag = a->diag,oidx;
183   MatScalar      *aa=a->a,*v;
184   PetscScalar    s1,s2,s3,x1,x2,x3;
185   PetscScalar    *x,*b;
186 
187   PetscFunctionBegin;
188   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
189   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
190   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
191 
192   /* forward solve the U^T */
193   idx = 0;
194   for (i=0; i<n; i++) {
195 
196     v     = aa + 9*diag[i];
197     /* multiply by the inverse of the block diagonal */
198     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
199     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
200     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
201     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
202     v += 9;
203 
204     vi    = aj + diag[i] + 1;
205     nz    = ai[i+1] - diag[i] - 1;
206     while (nz--) {
207       oidx = 3*(*vi++);
208       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
209       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
210       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
211       v  += 9;
212     }
213     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
214     idx += 3;
215   }
216   /* backward solve the L^T */
217   for (i=n-1; i>=0; i--){
218     v    = aa + 9*diag[i] - 9;
219     vi   = aj + diag[i] - 1;
220     nz   = diag[i] - ai[i];
221     idt  = 3*i;
222     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
223     while (nz--) {
224       idx   = 3*(*vi--);
225       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
226       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
227       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
228       v -= 9;
229     }
230   }
231   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
232   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
233   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
234   PetscFunctionReturn(0);
235 }
236 
237 #undef __FUNCT__
238 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
239 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
240 {
241   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
242   PetscErrorCode ierr;
243   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
244   PetscInt       nz,idx,idt,j,i,oidx;
245   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
246   MatScalar      *aa=a->a,*v;
247   PetscScalar    s1,s2,s3,x1,x2,x3;
248   PetscScalar    *x,*b;
249 
250   PetscFunctionBegin;
251   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
252   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
253   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
254 
255   /* forward solve the U^T */
256   idx = 0;
257   for (i=0; i<n; i++) {
258     v     = aa + bs2*diag[i];
259     /* multiply by the inverse of the block diagonal */
260     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
261     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
262     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
263     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
264     v -= bs2;
265 
266     vi    = aj + diag[i] - 1;
267     nz    = diag[i] - diag[i+1] - 1;
268     for(j=0;j>-nz;j--){
269       oidx = bs*vi[j];
270       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
271       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
272       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
273       v  -= bs2;
274     }
275     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
276     idx += bs;
277   }
278   /* backward solve the L^T */
279   for (i=n-1; i>=0; i--){
280     v    = aa + bs2*ai[i];
281     vi   = aj + ai[i];
282     nz   = ai[i+1] - ai[i];
283     idt  = bs*i;
284     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
285     for(j=0;j<nz;j++){
286       idx   = bs*vi[j];
287       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
288       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
289       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
290       v += bs2;
291     }
292   }
293   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
294   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
295   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
296   PetscFunctionReturn(0);
297 }
298 
299 #undef __FUNCT__
300 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
301 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
302 {
303   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
304   PetscErrorCode ierr;
305   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
306   PetscInt       *diag = a->diag,oidx;
307   MatScalar      *aa=a->a,*v;
308   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
309   PetscScalar    *x,*b;
310 
311   PetscFunctionBegin;
312   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
313   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
314   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
315 
316   /* forward solve the U^T */
317   idx = 0;
318   for (i=0; i<n; i++) {
319 
320     v     = aa + 16*diag[i];
321     /* multiply by the inverse of the block diagonal */
322     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
323     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
324     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
325     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
326     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
327     v += 16;
328 
329     vi    = aj + diag[i] + 1;
330     nz    = ai[i+1] - diag[i] - 1;
331     while (nz--) {
332       oidx = 4*(*vi++);
333       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
334       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
335       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
336       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
337       v  += 16;
338     }
339     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
340     idx += 4;
341   }
342   /* backward solve the L^T */
343   for (i=n-1; i>=0; i--){
344     v    = aa + 16*diag[i] - 16;
345     vi   = aj + diag[i] - 1;
346     nz   = diag[i] - ai[i];
347     idt  = 4*i;
348     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
349     while (nz--) {
350       idx   = 4*(*vi--);
351       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
352       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
353       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
354       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
355       v -= 16;
356     }
357   }
358   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
359   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
360   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
361   PetscFunctionReturn(0);
362 }
363 
364 #undef __FUNCT__
365 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
366 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
367 {
368   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
369   PetscErrorCode ierr;
370   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
371   PetscInt       nz,idx,idt,j,i,oidx;
372   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
373   MatScalar      *aa=a->a,*v;
374   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
375   PetscScalar    *x,*b;
376 
377   PetscFunctionBegin;
378   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
379   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
380   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
381 
382   /* forward solve the U^T */
383   idx = 0;
384   for (i=0; i<n; i++) {
385     v     = aa + bs2*diag[i];
386     /* multiply by the inverse of the block diagonal */
387     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
388     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
389     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
390     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
391     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
392     v -= bs2;
393 
394     vi    = aj + diag[i] - 1;
395     nz    = diag[i] - diag[i+1] - 1;
396     for(j=0;j>-nz;j--){
397       oidx = bs*vi[j];
398       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
399       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
400       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
401       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
402       v  -= bs2;
403     }
404     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
405     idx += bs;
406   }
407   /* backward solve the L^T */
408   for (i=n-1; i>=0; i--){
409     v    = aa + bs2*ai[i];
410     vi   = aj + ai[i];
411     nz   = ai[i+1] - ai[i];
412     idt  = bs*i;
413     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
414     for(j=0;j<nz;j++){
415       idx   = bs*vi[j];
416       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
417       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
418       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
419       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
420       v += bs2;
421     }
422   }
423   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
424   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
425   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
426   PetscFunctionReturn(0);
427 }
428 
429 #undef __FUNCT__
430 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
431 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
432 {
433   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
434   PetscErrorCode ierr;
435   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
436   PetscInt       *diag = a->diag,oidx;
437   MatScalar      *aa=a->a,*v;
438   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
439   PetscScalar    *x,*b;
440 
441   PetscFunctionBegin;
442   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
443   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
444   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
445 
446   /* forward solve the U^T */
447   idx = 0;
448   for (i=0; i<n; i++) {
449 
450     v     = aa + 25*diag[i];
451     /* multiply by the inverse of the block diagonal */
452     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
453     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
454     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
455     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
456     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
457     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
458     v += 25;
459 
460     vi    = aj + diag[i] + 1;
461     nz    = ai[i+1] - diag[i] - 1;
462     while (nz--) {
463       oidx = 5*(*vi++);
464       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
465       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
466       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
467       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
468       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
469       v  += 25;
470     }
471     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
472     idx += 5;
473   }
474   /* backward solve the L^T */
475   for (i=n-1; i>=0; i--){
476     v    = aa + 25*diag[i] - 25;
477     vi   = aj + diag[i] - 1;
478     nz   = diag[i] - ai[i];
479     idt  = 5*i;
480     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
481     while (nz--) {
482       idx   = 5*(*vi--);
483       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
484       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
485       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
486       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
487       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
488       v -= 25;
489     }
490   }
491   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
492   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
493   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
494   PetscFunctionReturn(0);
495 }
496 
497 #undef __FUNCT__
498 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
499 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
500 {
501   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
502   PetscErrorCode ierr;
503   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
504   PetscInt       nz,idx,idt,j,i,oidx;
505   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
506   MatScalar      *aa=a->a,*v;
507   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
508   PetscScalar    *x,*b;
509 
510   PetscFunctionBegin;
511   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
512   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
513   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
514 
515   /* forward solve the U^T */
516   idx = 0;
517   for (i=0; i<n; i++) {
518     v     = aa + bs2*diag[i];
519     /* multiply by the inverse of the block diagonal */
520     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
521     x5 = x[4+idx];
522     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
523     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
524     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
525     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
526     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
527     v -= bs2;
528 
529     vi    = aj + diag[i] - 1;
530     nz    = diag[i] - diag[i+1] - 1;
531     for(j=0;j>-nz;j--){
532       oidx = bs*vi[j];
533       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
534       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
535       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
536       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
537       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
538       v  -= bs2;
539     }
540     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
541     idx += bs;
542   }
543   /* backward solve the L^T */
544   for (i=n-1; i>=0; i--){
545     v    = aa + bs2*ai[i];
546     vi   = aj + ai[i];
547     nz   = ai[i+1] - ai[i];
548     idt  = bs*i;
549     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
550     for(j=0;j<nz;j++){
551       idx   = bs*vi[j];
552       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
553       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
554       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
555       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
556       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
557       v += bs2;
558     }
559   }
560   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
561   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
562   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
563   PetscFunctionReturn(0);
564 }
565 
566 #undef __FUNCT__
567 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
568 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
569 {
570   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
571   PetscErrorCode ierr;
572   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
573   PetscInt       *diag = a->diag,oidx;
574   MatScalar      *aa=a->a,*v;
575   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
576   PetscScalar    *x,*b;
577 
578   PetscFunctionBegin;
579   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
580   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
581   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
582 
583   /* forward solve the U^T */
584   idx = 0;
585   for (i=0; i<n; i++) {
586 
587     v     = aa + 36*diag[i];
588     /* multiply by the inverse of the block diagonal */
589     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
590     x6    = x[5+idx];
591     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
592     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
593     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
594     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
595     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
596     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
597     v += 36;
598 
599     vi    = aj + diag[i] + 1;
600     nz    = ai[i+1] - diag[i] - 1;
601     while (nz--) {
602       oidx = 6*(*vi++);
603       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
604       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
605       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
606       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
607       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
608       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
609       v  += 36;
610     }
611     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
612     x[5+idx] = s6;
613     idx += 6;
614   }
615   /* backward solve the L^T */
616   for (i=n-1; i>=0; i--){
617     v    = aa + 36*diag[i] - 36;
618     vi   = aj + diag[i] - 1;
619     nz   = diag[i] - ai[i];
620     idt  = 6*i;
621     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
622     s6 = x[5+idt];
623     while (nz--) {
624       idx   = 6*(*vi--);
625       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
626       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
627       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
628       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
629       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
630       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
631       v -= 36;
632     }
633   }
634   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
635   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
636   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
637   PetscFunctionReturn(0);
638 }
639 
640 #undef __FUNCT__
641 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
642 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
643 {
644   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
645   PetscErrorCode ierr;
646   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
647   PetscInt       nz,idx,idt,j,i,oidx;
648   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
649   MatScalar      *aa=a->a,*v;
650   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
651   PetscScalar    *x,*b;
652 
653   PetscFunctionBegin;
654   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
655   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
656   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
657 
658   /* forward solve the U^T */
659   idx = 0;
660   for (i=0; i<n; i++) {
661     v     = aa + bs2*diag[i];
662     /* multiply by the inverse of the block diagonal */
663     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
664     x5 = x[4+idx]; x6 = x[5+idx];
665     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
666     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
667     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
668     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
669     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
670     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
671     v -= bs2;
672 
673     vi    = aj + diag[i] - 1;
674     nz    = diag[i] - diag[i+1] - 1;
675     for(j=0;j>-nz;j--){
676       oidx = bs*vi[j];
677       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
678       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
679       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
680       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
681       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
682       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
683       v  -= bs2;
684     }
685     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
686     x[5+idx] = s6;
687     idx += bs;
688   }
689   /* backward solve the L^T */
690   for (i=n-1; i>=0; i--){
691     v    = aa + bs2*ai[i];
692     vi   = aj + ai[i];
693     nz   = ai[i+1] - ai[i];
694     idt  = bs*i;
695     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
696     s6   = x[5+idt];
697     for(j=0;j<nz;j++){
698       idx   = bs*vi[j];
699       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
700       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
701       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
702       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
703       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
704       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
705       v += bs2;
706     }
707   }
708   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
709   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
710   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
711   PetscFunctionReturn(0);
712 }
713 
714 #undef __FUNCT__
715 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
716 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
717 {
718   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
719   PetscErrorCode ierr;
720   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
721   PetscInt       *diag = a->diag,oidx;
722   MatScalar      *aa=a->a,*v;
723   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
724   PetscScalar    *x,*b;
725 
726   PetscFunctionBegin;
727   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
728   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
729   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
730 
731   /* forward solve the U^T */
732   idx = 0;
733   for (i=0; i<n; i++) {
734 
735     v     = aa + 49*diag[i];
736     /* multiply by the inverse of the block diagonal */
737     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
738     x6    = x[5+idx]; x7 = x[6+idx];
739     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
740     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
741     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
742     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
743     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
744     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
745     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
746     v += 49;
747 
748     vi    = aj + diag[i] + 1;
749     nz    = ai[i+1] - diag[i] - 1;
750     while (nz--) {
751       oidx = 7*(*vi++);
752       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
753       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
754       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
755       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
756       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
757       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
758       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
759       v  += 49;
760     }
761     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
762     x[5+idx] = s6;x[6+idx] = s7;
763     idx += 7;
764   }
765   /* backward solve the L^T */
766   for (i=n-1; i>=0; i--){
767     v    = aa + 49*diag[i] - 49;
768     vi   = aj + diag[i] - 1;
769     nz   = diag[i] - ai[i];
770     idt  = 7*i;
771     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
772     s6 = x[5+idt];s7 = x[6+idt];
773     while (nz--) {
774       idx   = 7*(*vi--);
775       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
776       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
777       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
778       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
779       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
780       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
781       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
782       v -= 49;
783     }
784   }
785   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
786   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
787   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
788   PetscFunctionReturn(0);
789 }
790 #undef __FUNCT__
791 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
792 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
793 {
794   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
795   PetscErrorCode ierr;
796   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
797   PetscInt       nz,idx,idt,j,i,oidx;
798   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
799   MatScalar      *aa=a->a,*v;
800   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
801   PetscScalar    *x,*b;
802 
803   PetscFunctionBegin;
804   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
805   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
806   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
807 
808   /* forward solve the U^T */
809   idx = 0;
810   for (i=0; i<n; i++) {
811     v     = aa + bs2*diag[i];
812     /* multiply by the inverse of the block diagonal */
813     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
814     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
815     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
816     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
817     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
818     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
819     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
820     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
821     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
822     v -= bs2;
823     vi    = aj + diag[i] - 1;
824     nz    = diag[i] - diag[i+1] - 1;
825     for(j=0;j>-nz;j--){
826       oidx = bs*vi[j];
827       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
828       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
829       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
830       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
831       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
832       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
833       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
834       v  -= bs2;
835     }
836     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
837     x[5+idx] = s6;  x[6+idx] = s7;
838     idx += bs;
839   }
840   /* backward solve the L^T */
841   for (i=n-1; i>=0; i--){
842     v    = aa + bs2*ai[i];
843     vi   = aj + ai[i];
844     nz   = ai[i+1] - ai[i];
845     idt  = bs*i;
846     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
847     s6   = x[5+idt];  s7 = x[6+idt];
848     for(j=0;j<nz;j++){
849       idx   = bs*vi[j];
850       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
851       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
852       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
853       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
854       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
855       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
856       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
857       v += bs2;
858     }
859   }
860   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
861   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
862   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
863   PetscFunctionReturn(0);
864 }
865 
866 /*---------------------------------------------------------------------------------------------*/
867 #undef __FUNCT__
868 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
869 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
870 {
871   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
872   IS             iscol=a->col,isrow=a->row;
873   PetscErrorCode ierr;
874   const PetscInt *r,*c,*rout,*cout;
875   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
876   PetscInt       *diag = a->diag;
877   MatScalar      *aa=a->a,*v;
878   PetscScalar    s1,*x,*b,*t;
879 
880   PetscFunctionBegin;
881   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
882   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
883   t  = a->solve_work;
884 
885   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
886   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
887 
888   /* copy the b into temp work space according to permutation */
889   for (i=0; i<n; i++) {
890     t[i] = b[c[i]];
891   }
892 
893   /* forward solve the U^T */
894   for (i=0; i<n; i++) {
895 
896     v     = aa + diag[i];
897     /* multiply by the inverse of the block diagonal */
898     s1    = (*v++)*t[i];
899     vi    = aj + diag[i] + 1;
900     nz    = ai[i+1] - diag[i] - 1;
901     while (nz--) {
902       t[*vi++]  -= (*v++)*s1;
903     }
904     t[i]   = s1;
905   }
906   /* backward solve the L^T */
907   for (i=n-1; i>=0; i--){
908     v    = aa + diag[i] - 1;
909     vi   = aj + diag[i] - 1;
910     nz   = diag[i] - ai[i];
911     s1   = t[i];
912     while (nz--) {
913       t[*vi--]   -=  (*v--)*s1;
914     }
915   }
916 
917   /* copy t into x according to permutation */
918   for (i=0; i<n; i++) {
919     x[r[i]]   = t[i];
920   }
921 
922   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
923   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
924   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
925   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
926   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
927   PetscFunctionReturn(0);
928 }
929 
930 #undef __FUNCT__
931 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
932 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
933 {
934   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
935   IS             iscol=a->col,isrow=a->row;
936   PetscErrorCode ierr;
937   const PetscInt *r,*c,*rout,*cout;
938   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
939   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
940   MatScalar      *aa=a->a,*v;
941   PetscScalar    s1,s2,x1,x2;
942   PetscScalar    *x,*b,*t;
943 
944   PetscFunctionBegin;
945   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
946   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
947   t  = a->solve_work;
948 
949   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
950   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
951 
952   /* copy the b into temp work space according to permutation */
953   ii = 0;
954   for (i=0; i<n; i++) {
955     ic      = 2*c[i];
956     t[ii]   = b[ic];
957     t[ii+1] = b[ic+1];
958     ii += 2;
959   }
960 
961   /* forward solve the U^T */
962   idx = 0;
963   for (i=0; i<n; i++) {
964 
965     v     = aa + 4*diag[i];
966     /* multiply by the inverse of the block diagonal */
967     x1    = t[idx];   x2 = t[1+idx];
968     s1 = v[0]*x1  +  v[1]*x2;
969     s2 = v[2]*x1  +  v[3]*x2;
970     v += 4;
971 
972     vi    = aj + diag[i] + 1;
973     nz    = ai[i+1] - diag[i] - 1;
974     while (nz--) {
975       oidx = 2*(*vi++);
976       t[oidx]   -= v[0]*s1  +  v[1]*s2;
977       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
978       v  += 4;
979     }
980     t[idx]   = s1;t[1+idx] = s2;
981     idx += 2;
982   }
983   /* backward solve the L^T */
984   for (i=n-1; i>=0; i--){
985     v    = aa + 4*diag[i] - 4;
986     vi   = aj + diag[i] - 1;
987     nz   = diag[i] - ai[i];
988     idt  = 2*i;
989     s1 = t[idt];  s2 = t[1+idt];
990     while (nz--) {
991       idx   = 2*(*vi--);
992       t[idx]   -=  v[0]*s1 +  v[1]*s2;
993       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
994       v -= 4;
995     }
996   }
997 
998   /* copy t into x according to permutation */
999   ii = 0;
1000   for (i=0; i<n; i++) {
1001     ir      = 2*r[i];
1002     x[ir]   = t[ii];
1003     x[ir+1] = t[ii+1];
1004     ii += 2;
1005   }
1006 
1007   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1008   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1009   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1010   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1011   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1012   PetscFunctionReturn(0);
1013 }
1014 
1015 #undef __FUNCT__
1016 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1017 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1018 {
1019   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1020   PetscErrorCode ierr;
1021   IS             iscol=a->col,isrow=a->row;
1022   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1023   const PetscInt *r,*c,*rout,*cout;
1024   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1025   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1026   MatScalar      *aa=a->a,*v;
1027   PetscScalar    s1,s2,x1,x2;
1028   PetscScalar    *x,*b,*t;
1029 
1030   PetscFunctionBegin;
1031   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1032   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1033   t = a->solve_work;
1034 
1035   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1036   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1037 
1038   /* copy b into temp work space according to permutation */
1039   for(i=0;i<n;i++){
1040     ii = bs*i; ic = bs*c[i];
1041     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1042   }
1043 
1044   /* forward solve the U^T */
1045   idx = 0;
1046   for (i=0; i<n; i++) {
1047     v     = aa + bs2*diag[i];
1048     /* multiply by the inverse of the block diagonal */
1049     x1 = t[idx];   x2 = t[1+idx];
1050     s1 = v[0]*x1  +  v[1]*x2;
1051     s2 = v[2]*x1  +  v[3]*x2;
1052     v -= bs2;
1053 
1054     vi    = aj + diag[i] - 1;
1055     nz    = diag[i] - diag[i+1] - 1;
1056     for(j=0;j>-nz;j--){
1057       oidx = bs*vi[j];
1058       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1059       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1060       v  -= bs2;
1061     }
1062     t[idx]   = s1;t[1+idx] = s2;
1063     idx += bs;
1064   }
1065   /* backward solve the L^T */
1066   for (i=n-1; i>=0; i--){
1067     v    = aa + bs2*ai[i];
1068     vi   = aj + ai[i];
1069     nz   = ai[i+1] - ai[i];
1070     idt  = bs*i;
1071     s1   = t[idt];  s2 = t[1+idt];
1072     for(j=0;j<nz;j++){
1073       idx   = bs*vi[j];
1074       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1075       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1076       v += bs2;
1077     }
1078   }
1079 
1080   /* copy t into x according to permutation */
1081   for(i=0;i<n;i++){
1082     ii = bs*i;  ir = bs*r[i];
1083     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1084   }
1085 
1086   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1087   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1088   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1090   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 #undef __FUNCT__
1095 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1096 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1097 {
1098   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1099   IS             iscol=a->col,isrow=a->row;
1100   PetscErrorCode ierr;
1101   const PetscInt *r,*c,*rout,*cout;
1102   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1103   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1104   MatScalar      *aa=a->a,*v;
1105   PetscScalar    s1,s2,s3,x1,x2,x3;
1106   PetscScalar    *x,*b,*t;
1107 
1108   PetscFunctionBegin;
1109   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1110   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1111   t  = a->solve_work;
1112 
1113   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1114   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1115 
1116   /* copy the b into temp work space according to permutation */
1117   ii = 0;
1118   for (i=0; i<n; i++) {
1119     ic      = 3*c[i];
1120     t[ii]   = b[ic];
1121     t[ii+1] = b[ic+1];
1122     t[ii+2] = b[ic+2];
1123     ii += 3;
1124   }
1125 
1126   /* forward solve the U^T */
1127   idx = 0;
1128   for (i=0; i<n; i++) {
1129 
1130     v     = aa + 9*diag[i];
1131     /* multiply by the inverse of the block diagonal */
1132     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1133     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1134     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1135     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1136     v += 9;
1137 
1138     vi    = aj + diag[i] + 1;
1139     nz    = ai[i+1] - diag[i] - 1;
1140     while (nz--) {
1141       oidx = 3*(*vi++);
1142       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1143       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1144       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1145       v  += 9;
1146     }
1147     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1148     idx += 3;
1149   }
1150   /* backward solve the L^T */
1151   for (i=n-1; i>=0; i--){
1152     v    = aa + 9*diag[i] - 9;
1153     vi   = aj + diag[i] - 1;
1154     nz   = diag[i] - ai[i];
1155     idt  = 3*i;
1156     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1157     while (nz--) {
1158       idx   = 3*(*vi--);
1159       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1160       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1161       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1162       v -= 9;
1163     }
1164   }
1165 
1166   /* copy t into x according to permutation */
1167   ii = 0;
1168   for (i=0; i<n; i++) {
1169     ir      = 3*r[i];
1170     x[ir]   = t[ii];
1171     x[ir+1] = t[ii+1];
1172     x[ir+2] = t[ii+2];
1173     ii += 3;
1174   }
1175 
1176   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1177   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1178   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1179   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1180   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 #undef __FUNCT__
1185 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1186 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1187 {
1188   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1189   PetscErrorCode ierr;
1190   IS             iscol=a->col,isrow=a->row;
1191   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1192   const PetscInt *r,*c,*rout,*cout;
1193   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1194   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1195   MatScalar      *aa=a->a,*v;
1196   PetscScalar    s1,s2,s3,x1,x2,x3;
1197   PetscScalar    *x,*b,*t;
1198 
1199   PetscFunctionBegin;
1200   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1201   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1202   t = a->solve_work;
1203 
1204   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1205   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1206 
1207   /* copy b into temp work space according to permutation */
1208   for(i=0;i<n;i++){
1209     ii = bs*i; ic = bs*c[i];
1210     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1211   }
1212 
1213   /* forward solve the U^T */
1214   idx = 0;
1215   for (i=0; i<n; i++) {
1216     v     = aa + bs2*diag[i];
1217     /* multiply by the inverse of the block diagonal */
1218     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1219     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1220     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1221     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1222     v -= bs2;
1223 
1224     vi    = aj + diag[i] - 1;
1225     nz    = diag[i] - diag[i+1] - 1;
1226     for(j=0;j>-nz;j--){
1227       oidx = bs*vi[j];
1228       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1229       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1230       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1231       v  -= bs2;
1232     }
1233     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1234     idx += bs;
1235   }
1236   /* backward solve the L^T */
1237   for (i=n-1; i>=0; i--){
1238     v    = aa + bs2*ai[i];
1239     vi   = aj + ai[i];
1240     nz   = ai[i+1] - ai[i];
1241     idt  = bs*i;
1242     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1243     for(j=0;j<nz;j++){
1244       idx   = bs*vi[j];
1245       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1246       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1247       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1248       v += bs2;
1249     }
1250   }
1251 
1252   /* copy t into x according to permutation */
1253   for(i=0;i<n;i++){
1254     ii = bs*i;  ir = bs*r[i];
1255     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1256   }
1257 
1258   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1259   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1260   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1261   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1262   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1263   PetscFunctionReturn(0);
1264 }
1265 
1266 #undef __FUNCT__
1267 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1268 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1269 {
1270   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1271   IS             iscol=a->col,isrow=a->row;
1272   PetscErrorCode ierr;
1273   const PetscInt *r,*c,*rout,*cout;
1274   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1275   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1276   MatScalar      *aa=a->a,*v;
1277   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1278   PetscScalar    *x,*b,*t;
1279 
1280   PetscFunctionBegin;
1281   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1282   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1283   t  = a->solve_work;
1284 
1285   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1286   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1287 
1288   /* copy the b into temp work space according to permutation */
1289   ii = 0;
1290   for (i=0; i<n; i++) {
1291     ic      = 4*c[i];
1292     t[ii]   = b[ic];
1293     t[ii+1] = b[ic+1];
1294     t[ii+2] = b[ic+2];
1295     t[ii+3] = b[ic+3];
1296     ii += 4;
1297   }
1298 
1299   /* forward solve the U^T */
1300   idx = 0;
1301   for (i=0; i<n; i++) {
1302 
1303     v     = aa + 16*diag[i];
1304     /* multiply by the inverse of the block diagonal */
1305     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1306     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1307     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1308     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1309     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1310     v += 16;
1311 
1312     vi    = aj + diag[i] + 1;
1313     nz    = ai[i+1] - diag[i] - 1;
1314     while (nz--) {
1315       oidx = 4*(*vi++);
1316       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1317       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1318       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1319       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1320       v  += 16;
1321     }
1322     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1323     idx += 4;
1324   }
1325   /* backward solve the L^T */
1326   for (i=n-1; i>=0; i--){
1327     v    = aa + 16*diag[i] - 16;
1328     vi   = aj + diag[i] - 1;
1329     nz   = diag[i] - ai[i];
1330     idt  = 4*i;
1331     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1332     while (nz--) {
1333       idx   = 4*(*vi--);
1334       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1335       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1336       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1337       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1338       v -= 16;
1339     }
1340   }
1341 
1342   /* copy t into x according to permutation */
1343   ii = 0;
1344   for (i=0; i<n; i++) {
1345     ir      = 4*r[i];
1346     x[ir]   = t[ii];
1347     x[ir+1] = t[ii+1];
1348     x[ir+2] = t[ii+2];
1349     x[ir+3] = t[ii+3];
1350     ii += 4;
1351   }
1352 
1353   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1354   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1355   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1356   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1357   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1358   PetscFunctionReturn(0);
1359 }
1360 
1361 #undef __FUNCT__
1362 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1363 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1364 {
1365   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1366   PetscErrorCode ierr;
1367   IS             iscol=a->col,isrow=a->row;
1368   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1369   const PetscInt *r,*c,*rout,*cout;
1370   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1371   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1372   MatScalar      *aa=a->a,*v;
1373   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1374   PetscScalar    *x,*b,*t;
1375 
1376   PetscFunctionBegin;
1377   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1378   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1379   t = a->solve_work;
1380 
1381   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1382   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1383 
1384   /* copy b into temp work space according to permutation */
1385   for(i=0;i<n;i++){
1386     ii = bs*i; ic = bs*c[i];
1387     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1388   }
1389 
1390   /* forward solve the U^T */
1391   idx = 0;
1392   for (i=0; i<n; i++) {
1393     v     = aa + bs2*diag[i];
1394     /* multiply by the inverse of the block diagonal */
1395     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1396     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1397     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1398     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1399     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1400     v -= bs2;
1401 
1402     vi    = aj + diag[i] - 1;
1403     nz    = diag[i] - diag[i+1] - 1;
1404     for(j=0;j>-nz;j--){
1405       oidx = bs*vi[j];
1406       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1407       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1408       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1409       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1410       v  -= bs2;
1411     }
1412     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1413     idx += bs;
1414   }
1415   /* backward solve the L^T */
1416   for (i=n-1; i>=0; i--){
1417     v    = aa + bs2*ai[i];
1418     vi   = aj + ai[i];
1419     nz   = ai[i+1] - ai[i];
1420     idt  = bs*i;
1421     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1422     for(j=0;j<nz;j++){
1423       idx   = bs*vi[j];
1424       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1425       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1426       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1427       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1428       v += bs2;
1429     }
1430   }
1431 
1432   /* copy t into x according to permutation */
1433   for(i=0;i<n;i++){
1434     ii = bs*i;  ir = bs*r[i];
1435     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1436   }
1437 
1438   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1439   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1440   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1441   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1442   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1443   PetscFunctionReturn(0);
1444 }
1445 
1446 #undef __FUNCT__
1447 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1448 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1449 {
1450   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1451   IS             iscol=a->col,isrow=a->row;
1452   PetscErrorCode ierr;
1453   const PetscInt *r,*c,*rout,*cout;
1454   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1455   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1456   MatScalar      *aa=a->a,*v;
1457   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1458   PetscScalar    *x,*b,*t;
1459 
1460   PetscFunctionBegin;
1461   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1462   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1463   t  = a->solve_work;
1464 
1465   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1466   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1467 
1468   /* copy the b into temp work space according to permutation */
1469   ii = 0;
1470   for (i=0; i<n; i++) {
1471     ic      = 5*c[i];
1472     t[ii]   = b[ic];
1473     t[ii+1] = b[ic+1];
1474     t[ii+2] = b[ic+2];
1475     t[ii+3] = b[ic+3];
1476     t[ii+4] = b[ic+4];
1477     ii += 5;
1478   }
1479 
1480   /* forward solve the U^T */
1481   idx = 0;
1482   for (i=0; i<n; i++) {
1483 
1484     v     = aa + 25*diag[i];
1485     /* multiply by the inverse of the block diagonal */
1486     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1487     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1488     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1489     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1490     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1491     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1492     v += 25;
1493 
1494     vi    = aj + diag[i] + 1;
1495     nz    = ai[i+1] - diag[i] - 1;
1496     while (nz--) {
1497       oidx = 5*(*vi++);
1498       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1499       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1500       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1501       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1502       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1503       v  += 25;
1504     }
1505     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1506     idx += 5;
1507   }
1508   /* backward solve the L^T */
1509   for (i=n-1; i>=0; i--){
1510     v    = aa + 25*diag[i] - 25;
1511     vi   = aj + diag[i] - 1;
1512     nz   = diag[i] - ai[i];
1513     idt  = 5*i;
1514     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1515     while (nz--) {
1516       idx   = 5*(*vi--);
1517       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1518       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1519       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1520       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1521       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1522       v -= 25;
1523     }
1524   }
1525 
1526   /* copy t into x according to permutation */
1527   ii = 0;
1528   for (i=0; i<n; i++) {
1529     ir      = 5*r[i];
1530     x[ir]   = t[ii];
1531     x[ir+1] = t[ii+1];
1532     x[ir+2] = t[ii+2];
1533     x[ir+3] = t[ii+3];
1534     x[ir+4] = t[ii+4];
1535     ii += 5;
1536   }
1537 
1538   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1539   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1540   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1541   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1542   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1543   PetscFunctionReturn(0);
1544 }
1545 
1546 #undef __FUNCT__
1547 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1548 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1549 {
1550   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1551   PetscErrorCode ierr;
1552   IS             iscol=a->col,isrow=a->row;
1553   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1554   const PetscInt *r,*c,*rout,*cout;
1555   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1556   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1557   MatScalar      *aa=a->a,*v;
1558   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1559   PetscScalar    *x,*b,*t;
1560 
1561   PetscFunctionBegin;
1562   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1563   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1564   t = a->solve_work;
1565 
1566   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1567   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1568 
1569   /* copy b into temp work space according to permutation */
1570   for(i=0;i<n;i++){
1571     ii = bs*i; ic = bs*c[i];
1572     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1573     t[ii+4] = b[ic+4];
1574   }
1575 
1576   /* forward solve the U^T */
1577   idx = 0;
1578   for (i=0; i<n; i++) {
1579     v     = aa + bs2*diag[i];
1580     /* multiply by the inverse of the block diagonal */
1581     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1582     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1583     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1584     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1585     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1586     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1587     v -= bs2;
1588 
1589     vi    = aj + diag[i] - 1;
1590     nz    = diag[i] - diag[i+1] - 1;
1591     for(j=0;j>-nz;j--){
1592       oidx = bs*vi[j];
1593       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1594       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1595       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1596       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1597       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1598       v  -= bs2;
1599     }
1600     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1601     idx += bs;
1602   }
1603   /* backward solve the L^T */
1604   for (i=n-1; i>=0; i--){
1605     v    = aa + bs2*ai[i];
1606     vi   = aj + ai[i];
1607     nz   = ai[i+1] - ai[i];
1608     idt  = bs*i;
1609     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1610     for(j=0;j<nz;j++){
1611       idx   = bs*vi[j];
1612       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1613       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1614       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1615       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1616       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1617       v += bs2;
1618     }
1619   }
1620 
1621   /* copy t into x according to permutation */
1622   for(i=0;i<n;i++){
1623     ii = bs*i;  ir = bs*r[i];
1624     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1625     x[ir+4] = t[ii+4];
1626   }
1627 
1628   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1629   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1630   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1631   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1632   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 #undef __FUNCT__
1637 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1638 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1639 {
1640   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1641   IS             iscol=a->col,isrow=a->row;
1642   PetscErrorCode ierr;
1643   const PetscInt *r,*c,*rout,*cout;
1644   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1645   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1646   MatScalar      *aa=a->a,*v;
1647   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1648   PetscScalar    *x,*b,*t;
1649 
1650   PetscFunctionBegin;
1651   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1652   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1653   t  = a->solve_work;
1654 
1655   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1656   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1657 
1658   /* copy the b into temp work space according to permutation */
1659   ii = 0;
1660   for (i=0; i<n; i++) {
1661     ic      = 6*c[i];
1662     t[ii]   = b[ic];
1663     t[ii+1] = b[ic+1];
1664     t[ii+2] = b[ic+2];
1665     t[ii+3] = b[ic+3];
1666     t[ii+4] = b[ic+4];
1667     t[ii+5] = b[ic+5];
1668     ii += 6;
1669   }
1670 
1671   /* forward solve the U^T */
1672   idx = 0;
1673   for (i=0; i<n; i++) {
1674 
1675     v     = aa + 36*diag[i];
1676     /* multiply by the inverse of the block diagonal */
1677     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1678     x6    = t[5+idx];
1679     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1680     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1681     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1682     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1683     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1684     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1685     v += 36;
1686 
1687     vi    = aj + diag[i] + 1;
1688     nz    = ai[i+1] - diag[i] - 1;
1689     while (nz--) {
1690       oidx = 6*(*vi++);
1691       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1692       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1693       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1694       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1695       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1696       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1697       v  += 36;
1698     }
1699     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1700     t[5+idx] = s6;
1701     idx += 6;
1702   }
1703   /* backward solve the L^T */
1704   for (i=n-1; i>=0; i--){
1705     v    = aa + 36*diag[i] - 36;
1706     vi   = aj + diag[i] - 1;
1707     nz   = diag[i] - ai[i];
1708     idt  = 6*i;
1709     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1710     s6 = t[5+idt];
1711     while (nz--) {
1712       idx   = 6*(*vi--);
1713       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1714       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1715       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1716       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1717       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1718       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1719       v -= 36;
1720     }
1721   }
1722 
1723   /* copy t into x according to permutation */
1724   ii = 0;
1725   for (i=0; i<n; i++) {
1726     ir      = 6*r[i];
1727     x[ir]   = t[ii];
1728     x[ir+1] = t[ii+1];
1729     x[ir+2] = t[ii+2];
1730     x[ir+3] = t[ii+3];
1731     x[ir+4] = t[ii+4];
1732     x[ir+5] = t[ii+5];
1733     ii += 6;
1734   }
1735 
1736   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1737   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1738   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1740   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1741   PetscFunctionReturn(0);
1742 }
1743 
1744 #undef __FUNCT__
1745 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1746 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1747 {
1748   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1749   PetscErrorCode ierr;
1750   IS             iscol=a->col,isrow=a->row;
1751   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1752   const PetscInt *r,*c,*rout,*cout;
1753   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1754   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1755   MatScalar      *aa=a->a,*v;
1756   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1757   PetscScalar    *x,*b,*t;
1758 
1759   PetscFunctionBegin;
1760   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1761   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1762   t = a->solve_work;
1763 
1764   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1765   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1766 
1767   /* copy b into temp work space according to permutation */
1768   for(i=0;i<n;i++){
1769     ii = bs*i; ic = bs*c[i];
1770     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1771     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1772   }
1773 
1774   /* forward solve the U^T */
1775   idx = 0;
1776   for (i=0; i<n; i++) {
1777     v     = aa + bs2*diag[i];
1778     /* multiply by the inverse of the block diagonal */
1779     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1780     x6    = t[5+idx];
1781     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1782     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1783     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1784     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1785     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1786     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1787     v -= bs2;
1788 
1789     vi    = aj + diag[i] - 1;
1790     nz    = diag[i] - diag[i+1] - 1;
1791     for(j=0;j>-nz;j--){
1792       oidx = bs*vi[j];
1793       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1794       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1795       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1796       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1797       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1798       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1799       v  -= bs2;
1800     }
1801     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1802     t[5+idx] = s6;
1803     idx += bs;
1804   }
1805   /* backward solve the L^T */
1806   for (i=n-1; i>=0; i--){
1807     v    = aa + bs2*ai[i];
1808     vi   = aj + ai[i];
1809     nz   = ai[i+1] - ai[i];
1810     idt  = bs*i;
1811     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1812     s6   = t[5+idt];
1813    for(j=0;j<nz;j++){
1814       idx   = bs*vi[j];
1815       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1816       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1817       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1818       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1819       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1820       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1821       v += bs2;
1822     }
1823   }
1824 
1825   /* copy t into x according to permutation */
1826   for(i=0;i<n;i++){
1827     ii = bs*i;  ir = bs*r[i];
1828     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1829     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1830   }
1831 
1832   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1833   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1834   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1835   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1836   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1837   PetscFunctionReturn(0);
1838 }
1839 
1840 #undef __FUNCT__
1841 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1842 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1843 {
1844   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1845   IS             iscol=a->col,isrow=a->row;
1846   PetscErrorCode ierr;
1847   const PetscInt *r,*c,*rout,*cout;
1848   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1849   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1850   MatScalar      *aa=a->a,*v;
1851   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1852   PetscScalar    *x,*b,*t;
1853 
1854   PetscFunctionBegin;
1855   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1856   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1857   t  = a->solve_work;
1858 
1859   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1860   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1861 
1862   /* copy the b into temp work space according to permutation */
1863   ii = 0;
1864   for (i=0; i<n; i++) {
1865     ic      = 7*c[i];
1866     t[ii]   = b[ic];
1867     t[ii+1] = b[ic+1];
1868     t[ii+2] = b[ic+2];
1869     t[ii+3] = b[ic+3];
1870     t[ii+4] = b[ic+4];
1871     t[ii+5] = b[ic+5];
1872     t[ii+6] = b[ic+6];
1873     ii += 7;
1874   }
1875 
1876   /* forward solve the U^T */
1877   idx = 0;
1878   for (i=0; i<n; i++) {
1879 
1880     v     = aa + 49*diag[i];
1881     /* multiply by the inverse of the block diagonal */
1882     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1883     x6    = t[5+idx]; x7 = t[6+idx];
1884     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1885     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1886     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1887     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1888     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1889     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1890     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1891     v += 49;
1892 
1893     vi    = aj + diag[i] + 1;
1894     nz    = ai[i+1] - diag[i] - 1;
1895     while (nz--) {
1896       oidx = 7*(*vi++);
1897       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1898       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1899       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1900       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1901       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1902       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1903       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1904       v  += 49;
1905     }
1906     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1907     t[5+idx] = s6;t[6+idx] = s7;
1908     idx += 7;
1909   }
1910   /* backward solve the L^T */
1911   for (i=n-1; i>=0; i--){
1912     v    = aa + 49*diag[i] - 49;
1913     vi   = aj + diag[i] - 1;
1914     nz   = diag[i] - ai[i];
1915     idt  = 7*i;
1916     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1917     s6 = t[5+idt];s7 = t[6+idt];
1918     while (nz--) {
1919       idx   = 7*(*vi--);
1920       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1921       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1922       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1923       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1924       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1925       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1926       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1927       v -= 49;
1928     }
1929   }
1930 
1931   /* copy t into x according to permutation */
1932   ii = 0;
1933   for (i=0; i<n; i++) {
1934     ir      = 7*r[i];
1935     x[ir]   = t[ii];
1936     x[ir+1] = t[ii+1];
1937     x[ir+2] = t[ii+2];
1938     x[ir+3] = t[ii+3];
1939     x[ir+4] = t[ii+4];
1940     x[ir+5] = t[ii+5];
1941     x[ir+6] = t[ii+6];
1942     ii += 7;
1943   }
1944 
1945   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1946   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1947   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1948   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1949   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1950   PetscFunctionReturn(0);
1951 }
1952 #undef __FUNCT__
1953 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1954 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1955 {
1956   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1957   PetscErrorCode ierr;
1958   IS             iscol=a->col,isrow=a->row;
1959   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1960   const PetscInt *r,*c,*rout,*cout;
1961   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1962   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1963   MatScalar      *aa=a->a,*v;
1964   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1965   PetscScalar    *x,*b,*t;
1966 
1967   PetscFunctionBegin;
1968   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1969   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1970   t = a->solve_work;
1971 
1972   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1973   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1974 
1975   /* copy b into temp work space according to permutation */
1976   for(i=0;i<n;i++){
1977     ii = bs*i; ic = bs*c[i];
1978     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1979     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1980   }
1981 
1982   /* forward solve the U^T */
1983   idx = 0;
1984   for (i=0; i<n; i++) {
1985     v     = aa + bs2*diag[i];
1986     /* multiply by the inverse of the block diagonal */
1987     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1988     x6    = t[5+idx]; x7 = t[6+idx];
1989     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1990     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1991     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1992     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1993     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1994     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1995     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1996     v -= bs2;
1997 
1998     vi    = aj + diag[i] - 1;
1999     nz    = diag[i] - diag[i+1] - 1;
2000     for(j=0;j>-nz;j--){
2001       oidx = bs*vi[j];
2002       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2003       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2004       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2005       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2006       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2007       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2008       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2009       v  -= bs2;
2010     }
2011     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2012     t[5+idx] = s6;  t[6+idx] = s7;
2013     idx += bs;
2014   }
2015   /* backward solve the L^T */
2016   for (i=n-1; i>=0; i--){
2017     v    = aa + bs2*ai[i];
2018     vi   = aj + ai[i];
2019     nz   = ai[i+1] - ai[i];
2020     idt  = bs*i;
2021     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2022     s6   = t[5+idt];  s7 = t[6+idt];
2023    for(j=0;j<nz;j++){
2024       idx   = bs*vi[j];
2025       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2026       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2027       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2028       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2029       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2030       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2031       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2032       v += bs2;
2033     }
2034   }
2035 
2036   /* copy t into x according to permutation */
2037   for(i=0;i<n;i++){
2038     ii = bs*i;  ir = bs*r[i];
2039     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2040     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2041   }
2042 
2043   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2044   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2045   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2046   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2047   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2048   PetscFunctionReturn(0);
2049 }
2050 
2051 /* ----------------------------------------------------------- */
2052 #undef __FUNCT__
2053 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2054 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2055 {
2056   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2057   IS             iscol=a->col,isrow=a->row;
2058   PetscErrorCode ierr;
2059   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2060   PetscInt       i,n=a->mbs;
2061   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
2062   MatScalar      *aa=a->a,*v;
2063   PetscScalar    *x,*b,*s,*t,*ls;
2064 
2065   PetscFunctionBegin;
2066   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2067   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2068   t  = a->solve_work;
2069 
2070   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2071   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2072 
2073   /* forward solve the lower triangular */
2074   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2075   for (i=1; i<n; i++) {
2076     v   = aa + bs2*ai[i];
2077     vi  = aj + ai[i];
2078     nz  = a->diag[i] - ai[i];
2079     s = t + bs*i;
2080     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2081     while (nz--) {
2082       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2083       v += bs2;
2084     }
2085   }
2086   /* backward solve the upper triangular */
2087   ls = a->solve_work + A->cmap->n;
2088   for (i=n-1; i>=0; i--){
2089     v   = aa + bs2*(a->diag[i] + 1);
2090     vi  = aj + a->diag[i] + 1;
2091     nz  = ai[i+1] - a->diag[i] - 1;
2092     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2093     while (nz--) {
2094       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2095       v += bs2;
2096     }
2097     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2098     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2099   }
2100 
2101   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2102   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2103   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2104   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2105   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2106   PetscFunctionReturn(0);
2107 }
2108 
2109 /* ----------------------------------------------------------- */
2110 #undef __FUNCT__
2111 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2112 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2113 {
2114   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2115   IS                iscol=a->col,isrow=a->row;
2116   PetscErrorCode    ierr;
2117   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2118   PetscInt          i,n=a->mbs,j;
2119   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
2120   const MatScalar   *aa=a->a,*v;
2121   PetscScalar       *x,*t,*ls;
2122   const PetscScalar *b;
2123   PetscFunctionBegin;
2124   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2125   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2126   t    = a->solve_work;
2127 
2128   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2129   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2130 
2131   /* copy the b into temp work space according to permutation */
2132   for (i=0; i<n; i++) {
2133     for (j=0; j<bs; j++) {
2134       t[i*bs+j] = b[c[i]*bs+j];
2135     }
2136   }
2137 
2138 
2139   /* forward solve the upper triangular transpose */
2140   ls = a->solve_work + A->cmap->n;
2141   for (i=0; i<n; i++){
2142     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2143     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2144     v   = aa + bs2*(a->diag[i] + 1);
2145     vi  = aj + a->diag[i] + 1;
2146     nz  = ai[i+1] - a->diag[i] - 1;
2147     while (nz--) {
2148       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2149       v += bs2;
2150     }
2151   }
2152 
2153   /* backward solve the lower triangular transpose */
2154   for (i=n-1; i>=0; i--) {
2155     v   = aa + bs2*ai[i];
2156     vi  = aj + ai[i];
2157     nz  = a->diag[i] - ai[i];
2158     while (nz--) {
2159       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2160       v += bs2;
2161     }
2162   }
2163 
2164   /* copy t into x according to permutation */
2165   for (i=0; i<n; i++) {
2166     for (j=0; j<bs; j++) {
2167       x[bs*r[i]+j]   = t[bs*i+j];
2168     }
2169   }
2170 
2171   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2172   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2173   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2175   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2176   PetscFunctionReturn(0);
2177 }
2178 
2179 #undef __FUNCT__
2180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2181 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2182 {
2183   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2184   IS                iscol=a->col,isrow=a->row;
2185   PetscErrorCode    ierr;
2186   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2187   PetscInt          i,n=a->mbs,j;
2188   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
2189   const MatScalar   *aa=a->a,*v;
2190   PetscScalar       *x,*t,*ls;
2191   const PetscScalar *b;
2192   PetscFunctionBegin;
2193   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2195   t    = a->solve_work;
2196 
2197   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2198   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2199 
2200   /* copy the b into temp work space according to permutation */
2201   for (i=0; i<n; i++) {
2202     for (j=0; j<bs; j++) {
2203       t[i*bs+j] = b[c[i]*bs+j];
2204     }
2205   }
2206 
2207 
2208   /* forward solve the upper triangular transpose */
2209   ls = a->solve_work + A->cmap->n;
2210   for (i=0; i<n; i++){
2211     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2212     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2213     v   = aa + bs2*(diag[i] - 1);
2214     vi  = aj + diag[i] - 1;
2215     nz  = diag[i] - diag[i+1] - 1;
2216     for(j=0;j>-nz;j--){
2217       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2218       v -= bs2;
2219     }
2220   }
2221 
2222   /* backward solve the lower triangular transpose */
2223   for (i=n-1; i>=0; i--) {
2224     v   = aa + bs2*ai[i];
2225     vi  = aj + ai[i];
2226     nz  = ai[i+1] - ai[i];
2227     for(j=0;j<nz;j++){
2228       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2229       v += bs2;
2230     }
2231   }
2232 
2233   /* copy t into x according to permutation */
2234   for (i=0; i<n; i++) {
2235     for (j=0; j<bs; j++) {
2236       x[bs*r[i]+j]   = t[bs*i+j];
2237     }
2238   }
2239 
2240   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2241   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2242   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2243   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2244   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2245   PetscFunctionReturn(0);
2246 }
2247 
2248 /* bs = 15 for PFLOTRAN */
2249 
2250 #undef __FUNCT__
2251 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering"
2252 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering(Mat A,Vec bb,Vec xx)
2253 {
2254   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2255   PetscErrorCode ierr;
2256   const PetscInt *ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2257   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
2258   MatScalar      *aa=a->a,*v;
2259   PetscScalar    s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2260   PetscScalar    x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2261   PetscScalar    *x,*b,*t;
2262 
2263   PetscFunctionBegin;
2264   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2265   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2266   t  = a->solve_work;
2267 
2268   /* forward solve the lower triangular */
2269   idx    = 0;
2270   t[0]  = b[idx];    t[1]  = b[1+idx];  t[2]  = b[2+idx];  t[3]  = b[3+idx];  t[4]  = b[4+idx];
2271   t[5]  = b[5+idx];  t[6]  = b[6+idx];  t[7]  = b[7+idx];  t[8]  = b[8+idx];  t[9]  = b[9+idx];
2272   t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx];
2273 
2274   for (i=1; i<n; i++) {
2275     v     = aa + bs2*ai[i];
2276     vi    = aj + ai[i];
2277     nz    = ai[i+1] - ai[i];
2278     idx   = bs*i;
2279     s1   = b[idx];    s2  = b[1+idx];  s3  = b[2+idx];  s4  = b[3+idx];  s5  = b[4+idx];
2280     s6   = b[5+idx];  s7  = b[6+idx];  s8  = b[7+idx];  s9  = b[8+idx];  s10 = b[9+idx];
2281     s11  = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx];
2282     for(m=0;m<nz;m++){
2283       idx   = bs*vi[m];
2284       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2285       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2286       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2287 
2288       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2289       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2290       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2291       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2292       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2293       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2294       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2295       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2296       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2297       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2298       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2299       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2300       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2301       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2302       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2303 
2304       v += bs2;
2305     }
2306     idx = bs*i;
2307     t[idx]    = s1;  t[1+idx]  = s2;  t[2+idx]  = s3;  t[3+idx]  = s4;  t[4+idx]  = s5;
2308     t[5+idx]  = s6;  t[6+idx]  = s7;  t[7+idx]  = s8;  t[8+idx]  = s9;  t[9+idx]  = s10;
2309     t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15;
2310 
2311   }
2312   /* backward solve the upper triangular */
2313   for (i=n-1; i>=0; i--){
2314     v    = aa + bs2*(adiag[i+1]+1);
2315     vi   = aj + adiag[i+1]+1;
2316     nz   = adiag[i] - adiag[i+1] - 1;
2317     idt  = bs*i;
2318     s1   = t[idt];     s2  = t[1+idt];  s3  = t[2+idt];  s4  = t[3+idt];  s5  = t[4+idt];
2319     s6   = t[5+idt];   s7  = t[6+idt];  s8  = t[7+idt];  s9  = t[8+idt];  s10 = t[9+idt];
2320     s11  = t[10+idt]; s12  = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt];
2321 
2322     for(m=0;m<nz;m++){
2323       idx   = bs*vi[m];
2324       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2325       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2326       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2327 
2328       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2329       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2330       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2331       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2332       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2333       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2334       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2335       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2336       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2337       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2338       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2339       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2340       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2341       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2342       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2343 
2344       v += bs2;
2345     }
2346     idc = bs*i;
2347 
2348     x[idc]    = t[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2349     x[1+idc]  = t[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2350     x[2+idc]  = t[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2351     x[3+idc]  = t[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2352     x[4+idc]  = t[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2353     x[5+idc]  = t[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2354     x[6+idc]  = t[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2355     x[7+idc]  = t[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2356     x[8+idc]  = t[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2357     x[9+idc]  = t[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2358     x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2359     x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2360     x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2361     x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2362     x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2363 
2364   }
2365 
2366   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2367   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2368   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2369   PetscFunctionReturn(0);
2370 }
2371 
2372 #undef __FUNCT__
2373 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2374 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2375 {
2376   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2377   IS             iscol=a->col,isrow=a->row;
2378   PetscErrorCode ierr;
2379   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
2380   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
2381   MatScalar      *aa=a->a,*v;
2382   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2383   PetscScalar    *x,*b,*t;
2384 
2385   PetscFunctionBegin;
2386   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2387   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2388   t  = a->solve_work;
2389 
2390   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2391   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2392 
2393   /* forward solve the lower triangular */
2394   idx    = 7*(*r++);
2395   t[0] = b[idx];   t[1] = b[1+idx];
2396   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2397   t[5] = b[5+idx]; t[6] = b[6+idx];
2398 
2399   for (i=1; i<n; i++) {
2400     v     = aa + 49*ai[i];
2401     vi    = aj + ai[i];
2402     nz    = diag[i] - ai[i];
2403     idx   = 7*(*r++);
2404     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2405     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2406     while (nz--) {
2407       idx   = 7*(*vi++);
2408       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2409       x4    = t[3+idx];x5 = t[4+idx];
2410       x6    = t[5+idx];x7 = t[6+idx];
2411       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2412       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2413       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2414       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2415       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2416       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2417       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2418       v += 49;
2419     }
2420     idx = 7*i;
2421     t[idx]   = s1;t[1+idx] = s2;
2422     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2423     t[5+idx] = s6;t[6+idx] = s7;
2424   }
2425   /* backward solve the upper triangular */
2426   for (i=n-1; i>=0; i--){
2427     v    = aa + 49*diag[i] + 49;
2428     vi   = aj + diag[i] + 1;
2429     nz   = ai[i+1] - diag[i] - 1;
2430     idt  = 7*i;
2431     s1 = t[idt];  s2 = t[1+idt];
2432     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2433     s6 = t[5+idt];s7 = t[6+idt];
2434     while (nz--) {
2435       idx   = 7*(*vi++);
2436       x1    = t[idx];   x2 = t[1+idx];
2437       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2438       x6    = t[5+idx]; x7 = t[6+idx];
2439       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2440       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2441       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2442       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2443       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2444       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2445       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2446       v += 49;
2447     }
2448     idc = 7*(*c--);
2449     v   = aa + 49*diag[i];
2450     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2451                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2452     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2453                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2454     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2455                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2456     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2457                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2458     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2459                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2460     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2461                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2462     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2463                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2464   }
2465 
2466   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2467   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2468   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2469   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2470   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2471   PetscFunctionReturn(0);
2472 }
2473 
2474 #undef __FUNCT__
2475 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2476 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2477 {
2478   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2479   IS             iscol=a->col,isrow=a->row;
2480   PetscErrorCode ierr;
2481   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
2482   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
2483   MatScalar      *aa=a->a,*v;
2484   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2485   PetscScalar    *x,*b,*t;
2486 
2487   PetscFunctionBegin;
2488   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2489   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2490   t  = a->solve_work;
2491 
2492   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2493   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2494 
2495   /* forward solve the lower triangular */
2496   idx    = 7*r[0];
2497   t[0] = b[idx];   t[1] = b[1+idx];
2498   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2499   t[5] = b[5+idx]; t[6] = b[6+idx];
2500 
2501   for (i=1; i<n; i++) {
2502     v     = aa + 49*ai[i];
2503     vi    = aj + ai[i];
2504     nz    = ai[i+1] - ai[i];
2505     idx   = 7*r[i];
2506     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2507     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2508     for(m=0;m<nz;m++){
2509       idx   = 7*vi[m];
2510       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2511       x4    = t[3+idx];x5 = t[4+idx];
2512       x6    = t[5+idx];x7 = t[6+idx];
2513       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2514       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2515       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2516       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2517       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2518       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2519       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2520       v += 49;
2521     }
2522     idx = 7*i;
2523     t[idx]   = s1;t[1+idx] = s2;
2524     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2525     t[5+idx] = s6;t[6+idx] = s7;
2526   }
2527   /* backward solve the upper triangular */
2528   for (i=n-1; i>=0; i--){
2529     v    = aa + 49*(adiag[i+1]+1);
2530     vi   = aj + adiag[i+1]+1;
2531     nz   = adiag[i] - adiag[i+1] - 1;
2532     idt  = 7*i;
2533     s1 = t[idt];  s2 = t[1+idt];
2534     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2535     s6 = t[5+idt];s7 = t[6+idt];
2536     for(m=0;m<nz;m++){
2537       idx   = 7*vi[m];
2538       x1    = t[idx];   x2 = t[1+idx];
2539       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2540       x6    = t[5+idx]; x7 = t[6+idx];
2541       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2542       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2543       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2544       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2545       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2546       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2547       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2548       v += 49;
2549     }
2550     idc = 7*c[i];
2551     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2552                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2553     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2554                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2555     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2556                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2557     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2558                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2559     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2560                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2561     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2562                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2563     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2564                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2565   }
2566 
2567   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2568   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2569   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2570   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2571   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2572   PetscFunctionReturn(0);
2573 }
2574 
2575 #undef __FUNCT__
2576 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2577 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2578 {
2579   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2580   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2581   PetscErrorCode    ierr;
2582   PetscInt          *diag = a->diag,jdx;
2583   const MatScalar   *aa=a->a,*v;
2584   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2585   const PetscScalar *b;
2586 
2587   PetscFunctionBegin;
2588   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2589   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2590   /* forward solve the lower triangular */
2591   idx    = 0;
2592   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2593   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2594   x[6] = b[6+idx];
2595   for (i=1; i<n; i++) {
2596     v     =  aa + 49*ai[i];
2597     vi    =  aj + ai[i];
2598     nz    =  diag[i] - ai[i];
2599     idx   =  7*i;
2600     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2601     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2602     s7  =  b[6+idx];
2603     while (nz--) {
2604       jdx   = 7*(*vi++);
2605       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2606       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2607       x7    = x[6+jdx];
2608       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2609       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2610       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2611       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2612       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2613       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2614       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2615       v += 49;
2616      }
2617     x[idx]   = s1;
2618     x[1+idx] = s2;
2619     x[2+idx] = s3;
2620     x[3+idx] = s4;
2621     x[4+idx] = s5;
2622     x[5+idx] = s6;
2623     x[6+idx] = s7;
2624   }
2625   /* backward solve the upper triangular */
2626   for (i=n-1; i>=0; i--){
2627     v    = aa + 49*diag[i] + 49;
2628     vi   = aj + diag[i] + 1;
2629     nz   = ai[i+1] - diag[i] - 1;
2630     idt  = 7*i;
2631     s1 = x[idt];   s2 = x[1+idt];
2632     s3 = x[2+idt]; s4 = x[3+idt];
2633     s5 = x[4+idt]; s6 = x[5+idt];
2634     s7 = x[6+idt];
2635     while (nz--) {
2636       idx   = 7*(*vi++);
2637       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2638       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2639       x7    = x[6+idx];
2640       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2641       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2642       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2643       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2644       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2645       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2646       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2647       v += 49;
2648     }
2649     v        = aa + 49*diag[i];
2650     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2651                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2652     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2653                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2654     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2655                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2656     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2657                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2658     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2659                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2660     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2661                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2662     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2663                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2664   }
2665 
2666   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2667   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2668   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2669   PetscFunctionReturn(0);
2670 }
2671 
2672 #undef __FUNCT__
2673 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2674 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2675 {
2676     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2677     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2678     PetscErrorCode    ierr;
2679     PetscInt          idx,jdx,idt;
2680     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2681     const MatScalar   *aa=a->a,*v;
2682     PetscScalar       *x;
2683     const PetscScalar *b;
2684     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2685 
2686     PetscFunctionBegin;
2687     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2688     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2689     /* forward solve the lower triangular */
2690     idx    = 0;
2691     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2692     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2693     for (i=1; i<n; i++) {
2694        v    = aa + bs2*ai[i];
2695        vi   = aj + ai[i];
2696        nz   = ai[i+1] - ai[i];
2697       idx   = bs*i;
2698        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2699        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2700        for(k=0;k<nz;k++) {
2701           jdx   = bs*vi[k];
2702           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2703 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2704           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2705           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2706           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2707 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2708           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2709 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2710 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2711           v   +=  bs2;
2712         }
2713 
2714        x[idx]   = s1;
2715        x[1+idx] = s2;
2716        x[2+idx] = s3;
2717        x[3+idx] = s4;
2718        x[4+idx] = s5;
2719        x[5+idx] = s6;
2720        x[6+idx] = s7;
2721     }
2722 
2723    /* backward solve the upper triangular */
2724   for (i=n-1; i>=0; i--){
2725     v   = aa + bs2*(adiag[i+1]+1);
2726      vi  = aj + adiag[i+1]+1;
2727      nz  = adiag[i] - adiag[i+1]-1;
2728      idt = bs*i;
2729      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2730      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2731     for(k=0;k<nz;k++) {
2732       idx   = bs*vi[k];
2733        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2734        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2735        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2736        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2737        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2738        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2739        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2740        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2741        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2742         v   +=  bs2;
2743     }
2744     /* x = inv_diagonal*x */
2745     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2746     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2747     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2748     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2749     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2750     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2751     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2752   }
2753 
2754   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2755   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2756   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2757   PetscFunctionReturn(0);
2758 }
2759 
2760 #undef __FUNCT__
2761 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2762 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2763 {
2764   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2765   IS                iscol=a->col,isrow=a->row;
2766   PetscErrorCode    ierr;
2767   const PetscInt    *r,*c,*rout,*cout;
2768   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2769   const MatScalar   *aa=a->a,*v;
2770   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2771   const PetscScalar *b;
2772   PetscFunctionBegin;
2773   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2774   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2775   t  = a->solve_work;
2776 
2777   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2778   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2779 
2780   /* forward solve the lower triangular */
2781   idx    = 6*(*r++);
2782   t[0] = b[idx];   t[1] = b[1+idx];
2783   t[2] = b[2+idx]; t[3] = b[3+idx];
2784   t[4] = b[4+idx]; t[5] = b[5+idx];
2785   for (i=1; i<n; i++) {
2786     v     = aa + 36*ai[i];
2787     vi    = aj + ai[i];
2788     nz    = diag[i] - ai[i];
2789     idx   = 6*(*r++);
2790     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2791     s5  = b[4+idx]; s6 = b[5+idx];
2792     while (nz--) {
2793       idx   = 6*(*vi++);
2794       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2795       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2796       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2797       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2798       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2799       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2800       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2801       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2802       v += 36;
2803     }
2804     idx = 6*i;
2805     t[idx]   = s1;t[1+idx] = s2;
2806     t[2+idx] = s3;t[3+idx] = s4;
2807     t[4+idx] = s5;t[5+idx] = s6;
2808   }
2809   /* backward solve the upper triangular */
2810   for (i=n-1; i>=0; i--){
2811     v    = aa + 36*diag[i] + 36;
2812     vi   = aj + diag[i] + 1;
2813     nz   = ai[i+1] - diag[i] - 1;
2814     idt  = 6*i;
2815     s1 = t[idt];  s2 = t[1+idt];
2816     s3 = t[2+idt];s4 = t[3+idt];
2817     s5 = t[4+idt];s6 = t[5+idt];
2818     while (nz--) {
2819       idx   = 6*(*vi++);
2820       x1    = t[idx];   x2 = t[1+idx];
2821       x3    = t[2+idx]; x4 = t[3+idx];
2822       x5    = t[4+idx]; x6 = t[5+idx];
2823       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2824       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2825       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2826       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2827       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2828       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2829       v += 36;
2830     }
2831     idc = 6*(*c--);
2832     v   = aa + 36*diag[i];
2833     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2834                                  v[18]*s4+v[24]*s5+v[30]*s6;
2835     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2836                                  v[19]*s4+v[25]*s5+v[31]*s6;
2837     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2838                                  v[20]*s4+v[26]*s5+v[32]*s6;
2839     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2840                                  v[21]*s4+v[27]*s5+v[33]*s6;
2841     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2842                                  v[22]*s4+v[28]*s5+v[34]*s6;
2843     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2844                                  v[23]*s4+v[29]*s5+v[35]*s6;
2845   }
2846 
2847   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2848   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2849   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2850   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2851   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2852   PetscFunctionReturn(0);
2853 }
2854 
2855 #undef __FUNCT__
2856 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
2857 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2858 {
2859   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2860   IS                iscol=a->col,isrow=a->row;
2861   PetscErrorCode    ierr;
2862   const PetscInt    *r,*c,*rout,*cout;
2863   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2864   const MatScalar   *aa=a->a,*v;
2865   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2866   const PetscScalar *b;
2867   PetscFunctionBegin;
2868   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2869   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2870   t  = a->solve_work;
2871 
2872   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2873   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2874 
2875   /* forward solve the lower triangular */
2876   idx    = 6*r[0];
2877   t[0] = b[idx];   t[1] = b[1+idx];
2878   t[2] = b[2+idx]; t[3] = b[3+idx];
2879   t[4] = b[4+idx]; t[5] = b[5+idx];
2880   for (i=1; i<n; i++) {
2881     v     = aa + 36*ai[i];
2882     vi    = aj + ai[i];
2883     nz    = ai[i+1] - ai[i];
2884     idx   = 6*r[i];
2885     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2886     s5  = b[4+idx]; s6 = b[5+idx];
2887     for(m=0;m<nz;m++){
2888       idx   = 6*vi[m];
2889       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2890       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2891       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2892       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2893       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2894       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2895       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2896       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2897       v += 36;
2898     }
2899     idx = 6*i;
2900     t[idx]   = s1;t[1+idx] = s2;
2901     t[2+idx] = s3;t[3+idx] = s4;
2902     t[4+idx] = s5;t[5+idx] = s6;
2903   }
2904   /* backward solve the upper triangular */
2905   for (i=n-1; i>=0; i--){
2906     v    = aa + 36*(adiag[i+1]+1);
2907     vi   = aj + adiag[i+1]+1;
2908     nz   = adiag[i] - adiag[i+1] - 1;
2909     idt  = 6*i;
2910     s1 = t[idt];  s2 = t[1+idt];
2911     s3 = t[2+idt];s4 = t[3+idt];
2912     s5 = t[4+idt];s6 = t[5+idt];
2913     for(m=0;m<nz;m++){
2914       idx   = 6*vi[m];
2915       x1    = t[idx];   x2 = t[1+idx];
2916       x3    = t[2+idx]; x4 = t[3+idx];
2917       x5    = t[4+idx]; x6 = t[5+idx];
2918       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2919       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2920       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2921       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2922       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2923       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2924       v += 36;
2925     }
2926     idc = 6*c[i];
2927     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2928                                  v[18]*s4+v[24]*s5+v[30]*s6;
2929     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2930                                  v[19]*s4+v[25]*s5+v[31]*s6;
2931     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2932                                  v[20]*s4+v[26]*s5+v[32]*s6;
2933     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2934                                  v[21]*s4+v[27]*s5+v[33]*s6;
2935     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2936                                  v[22]*s4+v[28]*s5+v[34]*s6;
2937     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2938                                  v[23]*s4+v[29]*s5+v[35]*s6;
2939   }
2940 
2941   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2942   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2943   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2944   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2945   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2946   PetscFunctionReturn(0);
2947 }
2948 
2949 #undef __FUNCT__
2950 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
2951 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2952 {
2953   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2954   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2955   PetscErrorCode    ierr;
2956   PetscInt          *diag = a->diag,jdx;
2957   const MatScalar   *aa=a->a,*v;
2958   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2959   const PetscScalar *b;
2960 
2961   PetscFunctionBegin;
2962   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2963   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2964   /* forward solve the lower triangular */
2965   idx    = 0;
2966   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2967   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2968   for (i=1; i<n; i++) {
2969     v     =  aa + 36*ai[i];
2970     vi    =  aj + ai[i];
2971     nz    =  diag[i] - ai[i];
2972     idx   =  6*i;
2973     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2974     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2975     while (nz--) {
2976       jdx   = 6*(*vi++);
2977       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2978       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2979       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2980       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2981       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2982       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2983       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2984       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2985       v += 36;
2986      }
2987     x[idx]   = s1;
2988     x[1+idx] = s2;
2989     x[2+idx] = s3;
2990     x[3+idx] = s4;
2991     x[4+idx] = s5;
2992     x[5+idx] = s6;
2993   }
2994   /* backward solve the upper triangular */
2995   for (i=n-1; i>=0; i--){
2996     v    = aa + 36*diag[i] + 36;
2997     vi   = aj + diag[i] + 1;
2998     nz   = ai[i+1] - diag[i] - 1;
2999     idt  = 6*i;
3000     s1 = x[idt];   s2 = x[1+idt];
3001     s3 = x[2+idt]; s4 = x[3+idt];
3002     s5 = x[4+idt]; s6 = x[5+idt];
3003     while (nz--) {
3004       idx   = 6*(*vi++);
3005       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3006       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3007       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3008       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3009       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3010       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3011       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3012       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3013       v += 36;
3014     }
3015     v        = aa + 36*diag[i];
3016     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3017     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3018     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3019     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3020     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3021     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3022   }
3023 
3024   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3025   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3026   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3027   PetscFunctionReturn(0);
3028 }
3029 
3030 #undef __FUNCT__
3031 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3032 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3033 {
3034     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3035     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3036     PetscErrorCode    ierr;
3037     PetscInt          idx,jdx,idt;
3038     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3039     const MatScalar   *aa=a->a,*v;
3040     PetscScalar       *x;
3041     const PetscScalar *b;
3042     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3043 
3044     PetscFunctionBegin;
3045     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3046     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3047     /* forward solve the lower triangular */
3048     idx    = 0;
3049     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3050     x[4] = b[4+idx];x[5] = b[5+idx];
3051     for (i=1; i<n; i++) {
3052        v    = aa + bs2*ai[i];
3053        vi   = aj + ai[i];
3054        nz   = ai[i+1] - ai[i];
3055       idx   = bs*i;
3056        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3057        s5   = b[4+idx];s6 = b[5+idx];
3058        for(k=0;k<nz;k++){
3059           jdx   = bs*vi[k];
3060           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3061 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3062           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3063           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3064           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3065 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3066           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3067 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3068           v   +=  bs2;
3069         }
3070 
3071        x[idx]   = s1;
3072        x[1+idx] = s2;
3073        x[2+idx] = s3;
3074        x[3+idx] = s4;
3075        x[4+idx] = s5;
3076        x[5+idx] = s6;
3077     }
3078 
3079    /* backward solve the upper triangular */
3080   for (i=n-1; i>=0; i--){
3081     v   = aa + bs2*(adiag[i+1]+1);
3082      vi  = aj + adiag[i+1]+1;
3083      nz  = adiag[i] - adiag[i+1]-1;
3084      idt = bs*i;
3085      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3086      s5 = x[4+idt];s6 = x[5+idt];
3087      for(k=0;k<nz;k++){
3088       idx   = bs*vi[k];
3089        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3090        x5    = x[4+idx];x6 = x[5+idx];
3091        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3092        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3093        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3094        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3095        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3096        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3097         v   +=  bs2;
3098     }
3099     /* x = inv_diagonal*x */
3100    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3101    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3102    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3103    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3104    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3105    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3106   }
3107 
3108   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3109   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3110   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3111   PetscFunctionReturn(0);
3112 }
3113 
3114 #undef __FUNCT__
3115 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3116 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3117 {
3118   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3119   IS                iscol=a->col,isrow=a->row;
3120   PetscErrorCode    ierr;
3121   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3122   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3123   const MatScalar   *aa=a->a,*v;
3124   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3125   const PetscScalar *b;
3126 
3127   PetscFunctionBegin;
3128   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3130   t  = a->solve_work;
3131 
3132   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3133   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3134 
3135   /* forward solve the lower triangular */
3136   idx    = 5*(*r++);
3137   t[0] = b[idx];   t[1] = b[1+idx];
3138   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3139   for (i=1; i<n; i++) {
3140     v     = aa + 25*ai[i];
3141     vi    = aj + ai[i];
3142     nz    = diag[i] - ai[i];
3143     idx   = 5*(*r++);
3144     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3145     s5  = b[4+idx];
3146     while (nz--) {
3147       idx   = 5*(*vi++);
3148       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3149       x4    = t[3+idx];x5 = t[4+idx];
3150       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3151       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3152       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3153       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3154       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3155       v += 25;
3156     }
3157     idx = 5*i;
3158     t[idx]   = s1;t[1+idx] = s2;
3159     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3160   }
3161   /* backward solve the upper triangular */
3162   for (i=n-1; i>=0; i--){
3163     v    = aa + 25*diag[i] + 25;
3164     vi   = aj + diag[i] + 1;
3165     nz   = ai[i+1] - diag[i] - 1;
3166     idt  = 5*i;
3167     s1 = t[idt];  s2 = t[1+idt];
3168     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3169     while (nz--) {
3170       idx   = 5*(*vi++);
3171       x1    = t[idx];   x2 = t[1+idx];
3172       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3173       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3174       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3175       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3176       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3177       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3178       v += 25;
3179     }
3180     idc = 5*(*c--);
3181     v   = aa + 25*diag[i];
3182     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3183                                  v[15]*s4+v[20]*s5;
3184     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3185                                  v[16]*s4+v[21]*s5;
3186     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3187                                  v[17]*s4+v[22]*s5;
3188     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3189                                  v[18]*s4+v[23]*s5;
3190     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3191                                  v[19]*s4+v[24]*s5;
3192   }
3193 
3194   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3195   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3196   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3197   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3198   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3199   PetscFunctionReturn(0);
3200 }
3201 
3202 #undef __FUNCT__
3203 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3204 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3205 {
3206   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3207   IS                iscol=a->col,isrow=a->row;
3208   PetscErrorCode    ierr;
3209   const PetscInt    *r,*c,*rout,*cout;
3210   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3211   const MatScalar   *aa=a->a,*v;
3212   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3213   const PetscScalar *b;
3214 
3215   PetscFunctionBegin;
3216   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3217   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3218   t  = a->solve_work;
3219 
3220   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3221   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3222 
3223   /* forward solve the lower triangular */
3224   idx    = 5*r[0];
3225   t[0] = b[idx];   t[1] = b[1+idx];
3226   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3227   for (i=1; i<n; i++) {
3228     v     = aa + 25*ai[i];
3229     vi    = aj + ai[i];
3230     nz    = ai[i+1] - ai[i];
3231     idx   = 5*r[i];
3232     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3233     s5  = b[4+idx];
3234     for(m=0;m<nz;m++){
3235       idx   = 5*vi[m];
3236       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3237       x4    = t[3+idx];x5 = t[4+idx];
3238       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3239       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3240       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3241       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3242       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3243       v += 25;
3244     }
3245     idx = 5*i;
3246     t[idx]   = s1;t[1+idx] = s2;
3247     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3248   }
3249   /* backward solve the upper triangular */
3250   for (i=n-1; i>=0; i--){
3251     v    = aa + 25*(adiag[i+1]+1);
3252     vi   = aj + adiag[i+1]+1;
3253     nz   = adiag[i] - adiag[i+1] - 1;
3254     idt  = 5*i;
3255     s1 = t[idt];  s2 = t[1+idt];
3256     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3257     for(m=0;m<nz;m++){
3258       idx   = 5*vi[m];
3259       x1    = t[idx];   x2 = t[1+idx];
3260       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3261       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3262       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3263       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3264       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3265       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3266       v += 25;
3267     }
3268     idc = 5*c[i];
3269     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3270                                  v[15]*s4+v[20]*s5;
3271     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3272                                  v[16]*s4+v[21]*s5;
3273     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3274                                  v[17]*s4+v[22]*s5;
3275     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3276                                  v[18]*s4+v[23]*s5;
3277     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3278                                  v[19]*s4+v[24]*s5;
3279   }
3280 
3281   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3282   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3283   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3284   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3285   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3286   PetscFunctionReturn(0);
3287 }
3288 
3289 #undef __FUNCT__
3290 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3291 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3292 {
3293   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3294   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3295   PetscErrorCode    ierr;
3296   PetscInt          *diag = a->diag,jdx;
3297   const MatScalar   *aa=a->a,*v;
3298   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3299   const PetscScalar *b;
3300 
3301   PetscFunctionBegin;
3302   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3303   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3304   /* forward solve the lower triangular */
3305   idx    = 0;
3306   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3307   for (i=1; i<n; i++) {
3308     v     =  aa + 25*ai[i];
3309     vi    =  aj + ai[i];
3310     nz    =  diag[i] - ai[i];
3311     idx   =  5*i;
3312     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3313     while (nz--) {
3314       jdx   = 5*(*vi++);
3315       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3316       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3317       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3318       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3319       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3320       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3321       v    += 25;
3322     }
3323     x[idx]   = s1;
3324     x[1+idx] = s2;
3325     x[2+idx] = s3;
3326     x[3+idx] = s4;
3327     x[4+idx] = s5;
3328   }
3329   /* backward solve the upper triangular */
3330   for (i=n-1; i>=0; i--){
3331     v    = aa + 25*diag[i] + 25;
3332     vi   = aj + diag[i] + 1;
3333     nz   = ai[i+1] - diag[i] - 1;
3334     idt  = 5*i;
3335     s1 = x[idt];  s2 = x[1+idt];
3336     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3337     while (nz--) {
3338       idx   = 5*(*vi++);
3339       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3340       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3341       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3342       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3343       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3344       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3345       v    += 25;
3346     }
3347     v        = aa + 25*diag[i];
3348     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3349     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3350     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3351     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3352     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3353   }
3354 
3355   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3356   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3357   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3358   PetscFunctionReturn(0);
3359 }
3360 
3361 #undef __FUNCT__
3362 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3363 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3364 {
3365   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3366   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
3367   PetscErrorCode    ierr;
3368   PetscInt          jdx;
3369   const MatScalar   *aa=a->a,*v;
3370   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3371   const PetscScalar *b;
3372 
3373   PetscFunctionBegin;
3374   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3375   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3376   /* forward solve the lower triangular */
3377   idx    = 0;
3378   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3379   for (i=1; i<n; i++) {
3380     v   = aa + 25*ai[i];
3381     vi  = aj + ai[i];
3382     nz  = ai[i+1] - ai[i];
3383     idx = 5*i;
3384     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3385     for(k=0;k<nz;k++) {
3386       jdx   = 5*vi[k];
3387       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3388       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3389       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3390       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3391       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3392       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3393       v    += 25;
3394     }
3395     x[idx]   = s1;
3396     x[1+idx] = s2;
3397     x[2+idx] = s3;
3398     x[3+idx] = s4;
3399     x[4+idx] = s5;
3400   }
3401 
3402   /* backward solve the upper triangular */
3403   for (i=n-1; i>=0; i--){
3404     v   = aa + 25*(adiag[i+1]+1);
3405     vi  = aj + adiag[i+1]+1;
3406     nz  = adiag[i] - adiag[i+1]-1;
3407     idt = 5*i;
3408     s1 = x[idt];  s2 = x[1+idt];
3409     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3410     for(k=0;k<nz;k++){
3411       idx   = 5*vi[k];
3412       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3413       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3414       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3415       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3416       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3417       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3418       v    += 25;
3419     }
3420     /* x = inv_diagonal*x */
3421     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3422     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3423     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3424     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3425     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3426   }
3427 
3428   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3429   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3430   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3431   PetscFunctionReturn(0);
3432 }
3433 
3434 #undef __FUNCT__
3435 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3436 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3437 {
3438   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3439   IS                iscol=a->col,isrow=a->row;
3440   PetscErrorCode    ierr;
3441   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3442   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3443   const MatScalar   *aa=a->a,*v;
3444   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3445   const PetscScalar *b;
3446 
3447   PetscFunctionBegin;
3448   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3449   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3450   t  = a->solve_work;
3451 
3452   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3453   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3454 
3455   /* forward solve the lower triangular */
3456   idx    = 4*(*r++);
3457   t[0] = b[idx];   t[1] = b[1+idx];
3458   t[2] = b[2+idx]; t[3] = b[3+idx];
3459   for (i=1; i<n; i++) {
3460     v     = aa + 16*ai[i];
3461     vi    = aj + ai[i];
3462     nz    = diag[i] - ai[i];
3463     idx   = 4*(*r++);
3464     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3465     while (nz--) {
3466       idx   = 4*(*vi++);
3467       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3468       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3469       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3470       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3471       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3472       v    += 16;
3473     }
3474     idx        = 4*i;
3475     t[idx]   = s1;t[1+idx] = s2;
3476     t[2+idx] = s3;t[3+idx] = s4;
3477   }
3478   /* backward solve the upper triangular */
3479   for (i=n-1; i>=0; i--){
3480     v    = aa + 16*diag[i] + 16;
3481     vi   = aj + diag[i] + 1;
3482     nz   = ai[i+1] - diag[i] - 1;
3483     idt  = 4*i;
3484     s1 = t[idt];  s2 = t[1+idt];
3485     s3 = t[2+idt];s4 = t[3+idt];
3486     while (nz--) {
3487       idx   = 4*(*vi++);
3488       x1    = t[idx];   x2 = t[1+idx];
3489       x3    = t[2+idx]; x4 = t[3+idx];
3490       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3491       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3492       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3493       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3494       v += 16;
3495     }
3496     idc      = 4*(*c--);
3497     v        = aa + 16*diag[i];
3498     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3499     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3500     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3501     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3502   }
3503 
3504   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3505   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3506   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3507   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3508   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3509   PetscFunctionReturn(0);
3510 }
3511 
3512 #undef __FUNCT__
3513 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3514 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3515 {
3516   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3517   IS                iscol=a->col,isrow=a->row;
3518   PetscErrorCode    ierr;
3519   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3520   const PetscInt    *r,*c,*rout,*cout;
3521   const MatScalar   *aa=a->a,*v;
3522   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3523   const PetscScalar *b;
3524 
3525   PetscFunctionBegin;
3526   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3527   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3528   t  = a->solve_work;
3529 
3530   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3531   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3532 
3533   /* forward solve the lower triangular */
3534   idx    = 4*r[0];
3535   t[0] = b[idx];   t[1] = b[1+idx];
3536   t[2] = b[2+idx]; t[3] = b[3+idx];
3537   for (i=1; i<n; i++) {
3538     v     = aa + 16*ai[i];
3539     vi    = aj + ai[i];
3540     nz    = ai[i+1] - ai[i];
3541     idx   = 4*r[i];
3542     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3543     for(m=0;m<nz;m++){
3544       idx   = 4*vi[m];
3545       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3546       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3547       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3548       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3549       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3550       v    += 16;
3551     }
3552     idx        = 4*i;
3553     t[idx]   = s1;t[1+idx] = s2;
3554     t[2+idx] = s3;t[3+idx] = s4;
3555   }
3556   /* backward solve the upper triangular */
3557   for (i=n-1; i>=0; i--){
3558     v    = aa + 16*(adiag[i+1]+1);
3559     vi   = aj + adiag[i+1]+1;
3560     nz   = adiag[i] - adiag[i+1] - 1;
3561     idt  = 4*i;
3562     s1 = t[idt];  s2 = t[1+idt];
3563     s3 = t[2+idt];s4 = t[3+idt];
3564     for(m=0;m<nz;m++){
3565       idx   = 4*vi[m];
3566       x1    = t[idx];   x2 = t[1+idx];
3567       x3    = t[2+idx]; x4 = t[3+idx];
3568       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3569       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3570       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3571       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3572       v += 16;
3573     }
3574     idc      = 4*c[i];
3575     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3576     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3577     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3578     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3579   }
3580 
3581   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3582   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3583   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3584   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3585   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3586   PetscFunctionReturn(0);
3587 }
3588 
3589 #undef __FUNCT__
3590 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3591 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3592 {
3593   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3594   IS                iscol=a->col,isrow=a->row;
3595   PetscErrorCode    ierr;
3596   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3597   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3598   const MatScalar   *aa=a->a,*v;
3599   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3600   PetscScalar       *x;
3601   const PetscScalar *b;
3602 
3603   PetscFunctionBegin;
3604   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3605   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3606   t  = (MatScalar *)a->solve_work;
3607 
3608   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3609   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3610 
3611   /* forward solve the lower triangular */
3612   idx    = 4*(*r++);
3613   t[0] = (MatScalar)b[idx];
3614   t[1] = (MatScalar)b[1+idx];
3615   t[2] = (MatScalar)b[2+idx];
3616   t[3] = (MatScalar)b[3+idx];
3617   for (i=1; i<n; i++) {
3618     v     = aa + 16*ai[i];
3619     vi    = aj + ai[i];
3620     nz    = diag[i] - ai[i];
3621     idx   = 4*(*r++);
3622     s1 = (MatScalar)b[idx];
3623     s2 = (MatScalar)b[1+idx];
3624     s3 = (MatScalar)b[2+idx];
3625     s4 = (MatScalar)b[3+idx];
3626     while (nz--) {
3627       idx   = 4*(*vi++);
3628       x1  = t[idx];
3629       x2  = t[1+idx];
3630       x3  = t[2+idx];
3631       x4  = t[3+idx];
3632       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3633       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3634       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3635       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3636       v    += 16;
3637     }
3638     idx        = 4*i;
3639     t[idx]   = s1;
3640     t[1+idx] = s2;
3641     t[2+idx] = s3;
3642     t[3+idx] = s4;
3643   }
3644   /* backward solve the upper triangular */
3645   for (i=n-1; i>=0; i--){
3646     v    = aa + 16*diag[i] + 16;
3647     vi   = aj + diag[i] + 1;
3648     nz   = ai[i+1] - diag[i] - 1;
3649     idt  = 4*i;
3650     s1 = t[idt];
3651     s2 = t[1+idt];
3652     s3 = t[2+idt];
3653     s4 = t[3+idt];
3654     while (nz--) {
3655       idx   = 4*(*vi++);
3656       x1  = t[idx];
3657       x2  = t[1+idx];
3658       x3  = t[2+idx];
3659       x4  = t[3+idx];
3660       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3661       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3662       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3663       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3664       v += 16;
3665     }
3666     idc      = 4*(*c--);
3667     v        = aa + 16*diag[i];
3668     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3669     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3670     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3671     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3672     x[idc]   = (PetscScalar)t[idt];
3673     x[1+idc] = (PetscScalar)t[1+idt];
3674     x[2+idc] = (PetscScalar)t[2+idt];
3675     x[3+idc] = (PetscScalar)t[3+idt];
3676  }
3677 
3678   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3679   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3680   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3681   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3682   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3683   PetscFunctionReturn(0);
3684 }
3685 
3686 #if defined (PETSC_HAVE_SSE)
3687 
3688 #include PETSC_HAVE_SSE
3689 
3690 #undef __FUNCT__
3691 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3692 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3693 {
3694   /*
3695      Note: This code uses demotion of double
3696      to float when performing the mixed-mode computation.
3697      This may not be numerically reasonable for all applications.
3698   */
3699   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3700   IS             iscol=a->col,isrow=a->row;
3701   PetscErrorCode ierr;
3702   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3703   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3704   MatScalar      *aa=a->a,*v;
3705   PetscScalar    *x,*b,*t;
3706 
3707   /* Make space in temp stack for 16 Byte Aligned arrays */
3708   float           ssealignedspace[11],*tmps,*tmpx;
3709   unsigned long   offset;
3710 
3711   PetscFunctionBegin;
3712   SSE_SCOPE_BEGIN;
3713 
3714     offset = (unsigned long)ssealignedspace % 16;
3715     if (offset) offset = (16 - offset)/4;
3716     tmps = &ssealignedspace[offset];
3717     tmpx = &ssealignedspace[offset+4];
3718     PREFETCH_NTA(aa+16*ai[1]);
3719 
3720     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3721     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3722     t  = a->solve_work;
3723 
3724     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3725     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3726 
3727     /* forward solve the lower triangular */
3728     idx  = 4*(*r++);
3729     t[0] = b[idx];   t[1] = b[1+idx];
3730     t[2] = b[2+idx]; t[3] = b[3+idx];
3731     v    =  aa + 16*ai[1];
3732 
3733     for (i=1; i<n;) {
3734       PREFETCH_NTA(&v[8]);
3735       vi   =  aj      + ai[i];
3736       nz   =  diag[i] - ai[i];
3737       idx  =  4*(*r++);
3738 
3739       /* Demote sum from double to float */
3740       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3741       LOAD_PS(tmps,XMM7);
3742 
3743       while (nz--) {
3744         PREFETCH_NTA(&v[16]);
3745         idx = 4*(*vi++);
3746 
3747         /* Demote solution (so far) from double to float */
3748         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3749 
3750         /* 4x4 Matrix-Vector product with negative accumulation: */
3751         SSE_INLINE_BEGIN_2(tmpx,v)
3752           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3753 
3754           /* First Column */
3755           SSE_COPY_PS(XMM0,XMM6)
3756           SSE_SHUFFLE(XMM0,XMM0,0x00)
3757           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3758           SSE_SUB_PS(XMM7,XMM0)
3759 
3760           /* Second Column */
3761           SSE_COPY_PS(XMM1,XMM6)
3762           SSE_SHUFFLE(XMM1,XMM1,0x55)
3763           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3764           SSE_SUB_PS(XMM7,XMM1)
3765 
3766           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3767 
3768           /* Third Column */
3769           SSE_COPY_PS(XMM2,XMM6)
3770           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3771           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3772           SSE_SUB_PS(XMM7,XMM2)
3773 
3774           /* Fourth Column */
3775           SSE_COPY_PS(XMM3,XMM6)
3776           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3777           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3778           SSE_SUB_PS(XMM7,XMM3)
3779         SSE_INLINE_END_2
3780 
3781         v  += 16;
3782       }
3783       idx = 4*i;
3784       v   = aa + 16*ai[++i];
3785       PREFETCH_NTA(v);
3786       STORE_PS(tmps,XMM7);
3787 
3788       /* Promote result from float to double */
3789       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3790     }
3791     /* backward solve the upper triangular */
3792     idt  = 4*(n-1);
3793     ai16 = 16*diag[n-1];
3794     v    = aa + ai16 + 16;
3795     for (i=n-1; i>=0;){
3796       PREFETCH_NTA(&v[8]);
3797       vi = aj + diag[i] + 1;
3798       nz = ai[i+1] - diag[i] - 1;
3799 
3800       /* Demote accumulator from double to float */
3801       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3802       LOAD_PS(tmps,XMM7);
3803 
3804       while (nz--) {
3805         PREFETCH_NTA(&v[16]);
3806         idx = 4*(*vi++);
3807 
3808         /* Demote solution (so far) from double to float */
3809         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3810 
3811         /* 4x4 Matrix-Vector Product with negative accumulation: */
3812         SSE_INLINE_BEGIN_2(tmpx,v)
3813           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3814 
3815           /* First Column */
3816           SSE_COPY_PS(XMM0,XMM6)
3817           SSE_SHUFFLE(XMM0,XMM0,0x00)
3818           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3819           SSE_SUB_PS(XMM7,XMM0)
3820 
3821           /* Second Column */
3822           SSE_COPY_PS(XMM1,XMM6)
3823           SSE_SHUFFLE(XMM1,XMM1,0x55)
3824           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3825           SSE_SUB_PS(XMM7,XMM1)
3826 
3827           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3828 
3829           /* Third Column */
3830           SSE_COPY_PS(XMM2,XMM6)
3831           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3832           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3833           SSE_SUB_PS(XMM7,XMM2)
3834 
3835           /* Fourth Column */
3836           SSE_COPY_PS(XMM3,XMM6)
3837           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3838           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3839           SSE_SUB_PS(XMM7,XMM3)
3840         SSE_INLINE_END_2
3841         v  += 16;
3842       }
3843       v    = aa + ai16;
3844       ai16 = 16*diag[--i];
3845       PREFETCH_NTA(aa+ai16+16);
3846       /*
3847          Scale the result by the diagonal 4x4 block,
3848          which was inverted as part of the factorization
3849       */
3850       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3851         /* First Column */
3852         SSE_COPY_PS(XMM0,XMM7)
3853         SSE_SHUFFLE(XMM0,XMM0,0x00)
3854         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3855 
3856         /* Second Column */
3857         SSE_COPY_PS(XMM1,XMM7)
3858         SSE_SHUFFLE(XMM1,XMM1,0x55)
3859         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3860         SSE_ADD_PS(XMM0,XMM1)
3861 
3862         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3863 
3864         /* Third Column */
3865         SSE_COPY_PS(XMM2,XMM7)
3866         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3867         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3868         SSE_ADD_PS(XMM0,XMM2)
3869 
3870         /* Fourth Column */
3871         SSE_COPY_PS(XMM3,XMM7)
3872         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3873         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3874         SSE_ADD_PS(XMM0,XMM3)
3875 
3876         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3877       SSE_INLINE_END_3
3878 
3879       /* Promote solution from float to double */
3880       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3881 
3882       /* Apply reordering to t and stream into x.    */
3883       /* This way, x doesn't pollute the cache.      */
3884       /* Be careful with size: 2 doubles = 4 floats! */
3885       idc  = 4*(*c--);
3886       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3887         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3888         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3889         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3890         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3891         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3892         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3893       SSE_INLINE_END_2
3894       v    = aa + ai16 + 16;
3895       idt -= 4;
3896     }
3897 
3898     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3899     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3900     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3901     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3902     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3903   SSE_SCOPE_END;
3904   PetscFunctionReturn(0);
3905 }
3906 
3907 #endif
3908 
3909 
3910 /*
3911       Special case where the matrix was ILU(0) factored in the natural
3912    ordering. This eliminates the need for the column and row permutation.
3913 */
3914 #undef __FUNCT__
3915 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
3916 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3917 {
3918   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3919   PetscInt          n=a->mbs;
3920   const PetscInt    *ai=a->i,*aj=a->j;
3921   PetscErrorCode    ierr;
3922   const PetscInt    *diag = a->diag;
3923   const MatScalar   *aa=a->a;
3924   PetscScalar       *x;
3925   const PetscScalar *b;
3926 
3927   PetscFunctionBegin;
3928   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3929   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3930 
3931 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3932   {
3933     static PetscScalar w[2000]; /* very BAD need to fix */
3934     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3935   }
3936 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3937   {
3938     static PetscScalar w[2000]; /* very BAD need to fix */
3939     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3940   }
3941 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3942   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3943 #else
3944   {
3945     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3946     const MatScalar *v;
3947     PetscInt        jdx,idt,idx,nz,i,ai16;
3948     const PetscInt  *vi;
3949 
3950   /* forward solve the lower triangular */
3951   idx    = 0;
3952   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3953   for (i=1; i<n; i++) {
3954     v     =  aa      + 16*ai[i];
3955     vi    =  aj      + ai[i];
3956     nz    =  diag[i] - ai[i];
3957     idx   +=  4;
3958     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3959     while (nz--) {
3960       jdx   = 4*(*vi++);
3961       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3962       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3963       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3964       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3965       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3966       v    += 16;
3967     }
3968     x[idx]   = s1;
3969     x[1+idx] = s2;
3970     x[2+idx] = s3;
3971     x[3+idx] = s4;
3972   }
3973   /* backward solve the upper triangular */
3974   idt = 4*(n-1);
3975   for (i=n-1; i>=0; i--){
3976     ai16 = 16*diag[i];
3977     v    = aa + ai16 + 16;
3978     vi   = aj + diag[i] + 1;
3979     nz   = ai[i+1] - diag[i] - 1;
3980     s1 = x[idt];  s2 = x[1+idt];
3981     s3 = x[2+idt];s4 = x[3+idt];
3982     while (nz--) {
3983       idx   = 4*(*vi++);
3984       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3985       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3986       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3987       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3988       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3989       v    += 16;
3990     }
3991     v        = aa + ai16;
3992     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3993     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3994     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3995     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3996     idt -= 4;
3997   }
3998   }
3999 #endif
4000 
4001   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4002   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4003   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4004   PetscFunctionReturn(0);
4005 }
4006 
4007 #undef __FUNCT__
4008 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4009 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4010 {
4011     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4012     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4013     PetscErrorCode    ierr;
4014     PetscInt          idx,jdx,idt;
4015     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4016     const MatScalar   *aa=a->a,*v;
4017     PetscScalar       *x;
4018     const PetscScalar *b;
4019     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4020 
4021     PetscFunctionBegin;
4022     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4023     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4024     /* forward solve the lower triangular */
4025     idx    = 0;
4026     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4027     for (i=1; i<n; i++) {
4028        v    = aa + bs2*ai[i];
4029        vi   = aj + ai[i];
4030        nz   = ai[i+1] - ai[i];
4031       idx   = bs*i;
4032        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4033       for(k=0;k<nz;k++) {
4034           jdx   = bs*vi[k];
4035           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4036           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4037           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4038           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4039 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4040 
4041           v   +=  bs2;
4042         }
4043 
4044        x[idx]   = s1;
4045        x[1+idx] = s2;
4046        x[2+idx] = s3;
4047        x[3+idx] = s4;
4048     }
4049 
4050    /* backward solve the upper triangular */
4051   for (i=n-1; i>=0; i--){
4052     v   = aa + bs2*(adiag[i+1]+1);
4053      vi  = aj + adiag[i+1]+1;
4054      nz  = adiag[i] - adiag[i+1]-1;
4055      idt = bs*i;
4056      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4057 
4058     for(k=0;k<nz;k++){
4059       idx   = bs*vi[k];
4060        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4061        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4062        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4063        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4064        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4065 
4066         v   +=  bs2;
4067     }
4068     /* x = inv_diagonal*x */
4069    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4070    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4071    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4072    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4073 
4074   }
4075 
4076   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4077   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4078   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4079   PetscFunctionReturn(0);
4080 }
4081 
4082 #undef __FUNCT__
4083 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4084 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4085 {
4086   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4087   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4088   PetscErrorCode ierr;
4089   PetscInt       *diag = a->diag;
4090   MatScalar      *aa=a->a;
4091   PetscScalar    *x,*b;
4092 
4093   PetscFunctionBegin;
4094   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4095   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4096 
4097   {
4098     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
4099     MatScalar  *v,*t=(MatScalar *)x;
4100     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
4101 
4102     /* forward solve the lower triangular */
4103     idx  = 0;
4104     t[0] = (MatScalar)b[0];
4105     t[1] = (MatScalar)b[1];
4106     t[2] = (MatScalar)b[2];
4107     t[3] = (MatScalar)b[3];
4108     for (i=1; i<n; i++) {
4109       v     =  aa      + 16*ai[i];
4110       vi    =  aj      + ai[i];
4111       nz    =  diag[i] - ai[i];
4112       idx   +=  4;
4113       s1 = (MatScalar)b[idx];
4114       s2 = (MatScalar)b[1+idx];
4115       s3 = (MatScalar)b[2+idx];
4116       s4 = (MatScalar)b[3+idx];
4117       while (nz--) {
4118         jdx = 4*(*vi++);
4119         x1  = t[jdx];
4120         x2  = t[1+jdx];
4121         x3  = t[2+jdx];
4122         x4  = t[3+jdx];
4123         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4124         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4125         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4126         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4127         v    += 16;
4128       }
4129       t[idx]   = s1;
4130       t[1+idx] = s2;
4131       t[2+idx] = s3;
4132       t[3+idx] = s4;
4133     }
4134     /* backward solve the upper triangular */
4135     idt = 4*(n-1);
4136     for (i=n-1; i>=0; i--){
4137       ai16 = 16*diag[i];
4138       v    = aa + ai16 + 16;
4139       vi   = aj + diag[i] + 1;
4140       nz   = ai[i+1] - diag[i] - 1;
4141       s1   = t[idt];
4142       s2   = t[1+idt];
4143       s3   = t[2+idt];
4144       s4   = t[3+idt];
4145       while (nz--) {
4146         idx = 4*(*vi++);
4147         x1  = (MatScalar)x[idx];
4148         x2  = (MatScalar)x[1+idx];
4149         x3  = (MatScalar)x[2+idx];
4150         x4  = (MatScalar)x[3+idx];
4151         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4152         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4153         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4154         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4155         v    += 16;
4156       }
4157       v        = aa + ai16;
4158       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4159       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4160       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4161       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4162       idt -= 4;
4163     }
4164   }
4165 
4166   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4168   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4169   PetscFunctionReturn(0);
4170 }
4171 
4172 #if defined (PETSC_HAVE_SSE)
4173 
4174 #include PETSC_HAVE_SSE
4175 #undef __FUNCT__
4176 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4177 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4178 {
4179   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4180   unsigned short *aj=(unsigned short *)a->j;
4181   PetscErrorCode ierr;
4182   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4183   MatScalar      *aa=a->a;
4184   PetscScalar    *x,*b;
4185 
4186   PetscFunctionBegin;
4187   SSE_SCOPE_BEGIN;
4188   /*
4189      Note: This code currently uses demotion of double
4190      to float when performing the mixed-mode computation.
4191      This may not be numerically reasonable for all applications.
4192   */
4193   PREFETCH_NTA(aa+16*ai[1]);
4194 
4195   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4196   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4197   {
4198     /* x will first be computed in single precision then promoted inplace to double */
4199     MatScalar      *v,*t=(MatScalar *)x;
4200     int            nz,i,idt,ai16;
4201     unsigned int   jdx,idx;
4202     unsigned short *vi;
4203     /* Forward solve the lower triangular factor. */
4204 
4205     /* First block is the identity. */
4206     idx  = 0;
4207     CONVERT_DOUBLE4_FLOAT4(t,b);
4208     v    =  aa + 16*((unsigned int)ai[1]);
4209 
4210     for (i=1; i<n;) {
4211       PREFETCH_NTA(&v[8]);
4212       vi   =  aj      + ai[i];
4213       nz   =  diag[i] - ai[i];
4214       idx +=  4;
4215 
4216       /* Demote RHS from double to float. */
4217       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4218       LOAD_PS(&t[idx],XMM7);
4219 
4220       while (nz--) {
4221         PREFETCH_NTA(&v[16]);
4222         jdx = 4*((unsigned int)(*vi++));
4223 
4224         /* 4x4 Matrix-Vector product with negative accumulation: */
4225         SSE_INLINE_BEGIN_2(&t[jdx],v)
4226           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4227 
4228           /* First Column */
4229           SSE_COPY_PS(XMM0,XMM6)
4230           SSE_SHUFFLE(XMM0,XMM0,0x00)
4231           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4232           SSE_SUB_PS(XMM7,XMM0)
4233 
4234           /* Second Column */
4235           SSE_COPY_PS(XMM1,XMM6)
4236           SSE_SHUFFLE(XMM1,XMM1,0x55)
4237           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4238           SSE_SUB_PS(XMM7,XMM1)
4239 
4240           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4241 
4242           /* Third Column */
4243           SSE_COPY_PS(XMM2,XMM6)
4244           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4245           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4246           SSE_SUB_PS(XMM7,XMM2)
4247 
4248           /* Fourth Column */
4249           SSE_COPY_PS(XMM3,XMM6)
4250           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4251           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4252           SSE_SUB_PS(XMM7,XMM3)
4253         SSE_INLINE_END_2
4254 
4255         v  += 16;
4256       }
4257       v    =  aa + 16*ai[++i];
4258       PREFETCH_NTA(v);
4259       STORE_PS(&t[idx],XMM7);
4260     }
4261 
4262     /* Backward solve the upper triangular factor.*/
4263 
4264     idt  = 4*(n-1);
4265     ai16 = 16*diag[n-1];
4266     v    = aa + ai16 + 16;
4267     for (i=n-1; i>=0;){
4268       PREFETCH_NTA(&v[8]);
4269       vi = aj + diag[i] + 1;
4270       nz = ai[i+1] - diag[i] - 1;
4271 
4272       LOAD_PS(&t[idt],XMM7);
4273 
4274       while (nz--) {
4275         PREFETCH_NTA(&v[16]);
4276         idx = 4*((unsigned int)(*vi++));
4277 
4278         /* 4x4 Matrix-Vector Product with negative accumulation: */
4279         SSE_INLINE_BEGIN_2(&t[idx],v)
4280           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4281 
4282           /* First Column */
4283           SSE_COPY_PS(XMM0,XMM6)
4284           SSE_SHUFFLE(XMM0,XMM0,0x00)
4285           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4286           SSE_SUB_PS(XMM7,XMM0)
4287 
4288           /* Second Column */
4289           SSE_COPY_PS(XMM1,XMM6)
4290           SSE_SHUFFLE(XMM1,XMM1,0x55)
4291           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4292           SSE_SUB_PS(XMM7,XMM1)
4293 
4294           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4295 
4296           /* Third Column */
4297           SSE_COPY_PS(XMM2,XMM6)
4298           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4299           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4300           SSE_SUB_PS(XMM7,XMM2)
4301 
4302           /* Fourth Column */
4303           SSE_COPY_PS(XMM3,XMM6)
4304           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4305           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4306           SSE_SUB_PS(XMM7,XMM3)
4307         SSE_INLINE_END_2
4308         v  += 16;
4309       }
4310       v    = aa + ai16;
4311       ai16 = 16*diag[--i];
4312       PREFETCH_NTA(aa+ai16+16);
4313       /*
4314          Scale the result by the diagonal 4x4 block,
4315          which was inverted as part of the factorization
4316       */
4317       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4318         /* First Column */
4319         SSE_COPY_PS(XMM0,XMM7)
4320         SSE_SHUFFLE(XMM0,XMM0,0x00)
4321         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4322 
4323         /* Second Column */
4324         SSE_COPY_PS(XMM1,XMM7)
4325         SSE_SHUFFLE(XMM1,XMM1,0x55)
4326         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4327         SSE_ADD_PS(XMM0,XMM1)
4328 
4329         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4330 
4331         /* Third Column */
4332         SSE_COPY_PS(XMM2,XMM7)
4333         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4334         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4335         SSE_ADD_PS(XMM0,XMM2)
4336 
4337         /* Fourth Column */
4338         SSE_COPY_PS(XMM3,XMM7)
4339         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4340         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4341         SSE_ADD_PS(XMM0,XMM3)
4342 
4343         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4344       SSE_INLINE_END_3
4345 
4346       v    = aa + ai16 + 16;
4347       idt -= 4;
4348     }
4349 
4350     /* Convert t from single precision back to double precision (inplace)*/
4351     idt = 4*(n-1);
4352     for (i=n-1;i>=0;i--) {
4353       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4354       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4355       PetscScalar *xtemp=&x[idt];
4356       MatScalar   *ttemp=&t[idt];
4357       xtemp[3] = (PetscScalar)ttemp[3];
4358       xtemp[2] = (PetscScalar)ttemp[2];
4359       xtemp[1] = (PetscScalar)ttemp[1];
4360       xtemp[0] = (PetscScalar)ttemp[0];
4361       idt -= 4;
4362     }
4363 
4364   } /* End of artificial scope. */
4365   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4366   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4367   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4368   SSE_SCOPE_END;
4369   PetscFunctionReturn(0);
4370 }
4371 
4372 #undef __FUNCT__
4373 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4374 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4375 {
4376   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4377   int            *aj=a->j;
4378   PetscErrorCode ierr;
4379   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4380   MatScalar      *aa=a->a;
4381   PetscScalar    *x,*b;
4382 
4383   PetscFunctionBegin;
4384   SSE_SCOPE_BEGIN;
4385   /*
4386      Note: This code currently uses demotion of double
4387      to float when performing the mixed-mode computation.
4388      This may not be numerically reasonable for all applications.
4389   */
4390   PREFETCH_NTA(aa+16*ai[1]);
4391 
4392   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4393   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4394   {
4395     /* x will first be computed in single precision then promoted inplace to double */
4396     MatScalar *v,*t=(MatScalar *)x;
4397     int       nz,i,idt,ai16;
4398     int       jdx,idx;
4399     int       *vi;
4400     /* Forward solve the lower triangular factor. */
4401 
4402     /* First block is the identity. */
4403     idx  = 0;
4404     CONVERT_DOUBLE4_FLOAT4(t,b);
4405     v    =  aa + 16*ai[1];
4406 
4407     for (i=1; i<n;) {
4408       PREFETCH_NTA(&v[8]);
4409       vi   =  aj      + ai[i];
4410       nz   =  diag[i] - ai[i];
4411       idx +=  4;
4412 
4413       /* Demote RHS from double to float. */
4414       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4415       LOAD_PS(&t[idx],XMM7);
4416 
4417       while (nz--) {
4418         PREFETCH_NTA(&v[16]);
4419         jdx = 4*(*vi++);
4420 /*          jdx = *vi++; */
4421 
4422         /* 4x4 Matrix-Vector product with negative accumulation: */
4423         SSE_INLINE_BEGIN_2(&t[jdx],v)
4424           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4425 
4426           /* First Column */
4427           SSE_COPY_PS(XMM0,XMM6)
4428           SSE_SHUFFLE(XMM0,XMM0,0x00)
4429           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4430           SSE_SUB_PS(XMM7,XMM0)
4431 
4432           /* Second Column */
4433           SSE_COPY_PS(XMM1,XMM6)
4434           SSE_SHUFFLE(XMM1,XMM1,0x55)
4435           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4436           SSE_SUB_PS(XMM7,XMM1)
4437 
4438           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4439 
4440           /* Third Column */
4441           SSE_COPY_PS(XMM2,XMM6)
4442           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4443           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4444           SSE_SUB_PS(XMM7,XMM2)
4445 
4446           /* Fourth Column */
4447           SSE_COPY_PS(XMM3,XMM6)
4448           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4449           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4450           SSE_SUB_PS(XMM7,XMM3)
4451         SSE_INLINE_END_2
4452 
4453         v  += 16;
4454       }
4455       v    =  aa + 16*ai[++i];
4456       PREFETCH_NTA(v);
4457       STORE_PS(&t[idx],XMM7);
4458     }
4459 
4460     /* Backward solve the upper triangular factor.*/
4461 
4462     idt  = 4*(n-1);
4463     ai16 = 16*diag[n-1];
4464     v    = aa + ai16 + 16;
4465     for (i=n-1; i>=0;){
4466       PREFETCH_NTA(&v[8]);
4467       vi = aj + diag[i] + 1;
4468       nz = ai[i+1] - diag[i] - 1;
4469 
4470       LOAD_PS(&t[idt],XMM7);
4471 
4472       while (nz--) {
4473         PREFETCH_NTA(&v[16]);
4474         idx = 4*(*vi++);
4475 /*          idx = *vi++; */
4476 
4477         /* 4x4 Matrix-Vector Product with negative accumulation: */
4478         SSE_INLINE_BEGIN_2(&t[idx],v)
4479           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4480 
4481           /* First Column */
4482           SSE_COPY_PS(XMM0,XMM6)
4483           SSE_SHUFFLE(XMM0,XMM0,0x00)
4484           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4485           SSE_SUB_PS(XMM7,XMM0)
4486 
4487           /* Second Column */
4488           SSE_COPY_PS(XMM1,XMM6)
4489           SSE_SHUFFLE(XMM1,XMM1,0x55)
4490           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4491           SSE_SUB_PS(XMM7,XMM1)
4492 
4493           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4494 
4495           /* Third Column */
4496           SSE_COPY_PS(XMM2,XMM6)
4497           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4498           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4499           SSE_SUB_PS(XMM7,XMM2)
4500 
4501           /* Fourth Column */
4502           SSE_COPY_PS(XMM3,XMM6)
4503           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4504           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4505           SSE_SUB_PS(XMM7,XMM3)
4506         SSE_INLINE_END_2
4507         v  += 16;
4508       }
4509       v    = aa + ai16;
4510       ai16 = 16*diag[--i];
4511       PREFETCH_NTA(aa+ai16+16);
4512       /*
4513          Scale the result by the diagonal 4x4 block,
4514          which was inverted as part of the factorization
4515       */
4516       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4517         /* First Column */
4518         SSE_COPY_PS(XMM0,XMM7)
4519         SSE_SHUFFLE(XMM0,XMM0,0x00)
4520         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4521 
4522         /* Second Column */
4523         SSE_COPY_PS(XMM1,XMM7)
4524         SSE_SHUFFLE(XMM1,XMM1,0x55)
4525         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4526         SSE_ADD_PS(XMM0,XMM1)
4527 
4528         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4529 
4530         /* Third Column */
4531         SSE_COPY_PS(XMM2,XMM7)
4532         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4533         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4534         SSE_ADD_PS(XMM0,XMM2)
4535 
4536         /* Fourth Column */
4537         SSE_COPY_PS(XMM3,XMM7)
4538         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4539         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4540         SSE_ADD_PS(XMM0,XMM3)
4541 
4542         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4543       SSE_INLINE_END_3
4544 
4545       v    = aa + ai16 + 16;
4546       idt -= 4;
4547     }
4548 
4549     /* Convert t from single precision back to double precision (inplace)*/
4550     idt = 4*(n-1);
4551     for (i=n-1;i>=0;i--) {
4552       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4553       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4554       PetscScalar *xtemp=&x[idt];
4555       MatScalar   *ttemp=&t[idt];
4556       xtemp[3] = (PetscScalar)ttemp[3];
4557       xtemp[2] = (PetscScalar)ttemp[2];
4558       xtemp[1] = (PetscScalar)ttemp[1];
4559       xtemp[0] = (PetscScalar)ttemp[0];
4560       idt -= 4;
4561     }
4562 
4563   } /* End of artificial scope. */
4564   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4565   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4566   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4567   SSE_SCOPE_END;
4568   PetscFunctionReturn(0);
4569 }
4570 
4571 #endif
4572 
4573 #undef __FUNCT__
4574 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4575 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4576 {
4577   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4578   IS                iscol=a->col,isrow=a->row;
4579   PetscErrorCode    ierr;
4580   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4581   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4582   const MatScalar   *aa=a->a,*v;
4583   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4584   const PetscScalar *b;
4585 
4586   PetscFunctionBegin;
4587   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4588   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4589   t  = a->solve_work;
4590 
4591   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4592   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4593 
4594   /* forward solve the lower triangular */
4595   idx    = 3*(*r++);
4596   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4597   for (i=1; i<n; i++) {
4598     v     = aa + 9*ai[i];
4599     vi    = aj + ai[i];
4600     nz    = diag[i] - ai[i];
4601     idx   = 3*(*r++);
4602     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4603     while (nz--) {
4604       idx   = 3*(*vi++);
4605       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4606       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4607       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4608       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4609       v += 9;
4610     }
4611     idx = 3*i;
4612     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4613   }
4614   /* backward solve the upper triangular */
4615   for (i=n-1; i>=0; i--){
4616     v    = aa + 9*diag[i] + 9;
4617     vi   = aj + diag[i] + 1;
4618     nz   = ai[i+1] - diag[i] - 1;
4619     idt  = 3*i;
4620     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4621     while (nz--) {
4622       idx   = 3*(*vi++);
4623       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4624       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4625       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4626       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4627       v += 9;
4628     }
4629     idc = 3*(*c--);
4630     v   = aa + 9*diag[i];
4631     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4632     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4633     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4634   }
4635   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4636   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4637   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4638   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4639   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4640   PetscFunctionReturn(0);
4641 }
4642 
4643 #undef __FUNCT__
4644 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4645 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4646 {
4647   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4648   IS                iscol=a->col,isrow=a->row;
4649   PetscErrorCode    ierr;
4650   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4651   const PetscInt    *r,*c,*rout,*cout;
4652   const MatScalar   *aa=a->a,*v;
4653   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4654   const PetscScalar *b;
4655 
4656   PetscFunctionBegin;
4657   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4658   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4659   t  = a->solve_work;
4660 
4661   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4662   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4663 
4664   /* forward solve the lower triangular */
4665   idx    = 3*r[0];
4666   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4667   for (i=1; i<n; i++) {
4668     v     = aa + 9*ai[i];
4669     vi    = aj + ai[i];
4670     nz    = ai[i+1] - ai[i];
4671     idx   = 3*r[i];
4672     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4673     for(m=0;m<nz;m++){
4674       idx   = 3*vi[m];
4675       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4676       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4677       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4678       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4679       v += 9;
4680     }
4681     idx = 3*i;
4682     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4683   }
4684   /* backward solve the upper triangular */
4685   for (i=n-1; i>=0; i--){
4686     v    = aa + 9*(adiag[i+1]+1);
4687     vi   = aj + adiag[i+1]+1;
4688     nz   = adiag[i] - adiag[i+1] - 1;
4689     idt  = 3*i;
4690     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4691     for(m=0;m<nz;m++){
4692       idx   = 3*vi[m];
4693       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4694       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4695       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4696       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4697       v += 9;
4698     }
4699     idc = 3*c[i];
4700     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4701     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4702     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4703   }
4704   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4705   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4706   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4707   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4708   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4709   PetscFunctionReturn(0);
4710 }
4711 
4712 /*
4713       Special case where the matrix was ILU(0) factored in the natural
4714    ordering. This eliminates the need for the column and row permutation.
4715 */
4716 #undef __FUNCT__
4717 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4718 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4719 {
4720   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4721   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4722   PetscErrorCode    ierr;
4723   PetscInt          *diag = a->diag;
4724   const MatScalar   *aa=a->a,*v;
4725   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4726   const PetscScalar *b;
4727   PetscInt          jdx,idt,idx,nz,*vi,i;
4728 
4729   PetscFunctionBegin;
4730   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4731   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4732 
4733   /* forward solve the lower triangular */
4734   idx    = 0;
4735   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4736   for (i=1; i<n; i++) {
4737     v     =  aa      + 9*ai[i];
4738     vi    =  aj      + ai[i];
4739     nz    =  diag[i] - ai[i];
4740     idx   +=  3;
4741     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4742     while (nz--) {
4743       jdx   = 3*(*vi++);
4744       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4745       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4746       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4747       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4748       v    += 9;
4749     }
4750     x[idx]   = s1;
4751     x[1+idx] = s2;
4752     x[2+idx] = s3;
4753   }
4754   /* backward solve the upper triangular */
4755   for (i=n-1; i>=0; i--){
4756     v    = aa + 9*diag[i] + 9;
4757     vi   = aj + diag[i] + 1;
4758     nz   = ai[i+1] - diag[i] - 1;
4759     idt  = 3*i;
4760     s1 = x[idt];  s2 = x[1+idt];
4761     s3 = x[2+idt];
4762     while (nz--) {
4763       idx   = 3*(*vi++);
4764       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4765       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4766       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4767       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4768       v    += 9;
4769     }
4770     v        = aa +  9*diag[i];
4771     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4772     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4773     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4774   }
4775 
4776   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4777   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4778   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4779   PetscFunctionReturn(0);
4780 }
4781 
4782 #undef __FUNCT__
4783 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4784 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4785 {
4786     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4787     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4788     PetscErrorCode    ierr;
4789     PetscInt          idx,jdx,idt;
4790     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4791     const MatScalar   *aa=a->a,*v;
4792     PetscScalar       *x;
4793     const PetscScalar *b;
4794     PetscScalar        s1,s2,s3,x1,x2,x3;
4795 
4796     PetscFunctionBegin;
4797     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4798     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4799     /* forward solve the lower triangular */
4800     idx    = 0;
4801     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4802     for (i=1; i<n; i++) {
4803        v    = aa + bs2*ai[i];
4804        vi   = aj + ai[i];
4805        nz   = ai[i+1] - ai[i];
4806       idx   = bs*i;
4807        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4808       for(k=0;k<nz;k++){
4809          jdx   = bs*vi[k];
4810           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4811           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4812           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4813           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4814 
4815           v   +=  bs2;
4816         }
4817 
4818        x[idx]   = s1;
4819        x[1+idx] = s2;
4820        x[2+idx] = s3;
4821     }
4822 
4823    /* backward solve the upper triangular */
4824   for (i=n-1; i>=0; i--){
4825     v   = aa + bs2*(adiag[i+1]+1);
4826      vi  = aj + adiag[i+1]+1;
4827      nz  = adiag[i] - adiag[i+1]-1;
4828      idt = bs*i;
4829      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4830 
4831      for(k=0;k<nz;k++){
4832        idx   = bs*vi[k];
4833        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4834        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4835        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4836        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4837 
4838         v   +=  bs2;
4839     }
4840     /* x = inv_diagonal*x */
4841    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4842    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4843    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4844 
4845   }
4846 
4847   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4848   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4849   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4850   PetscFunctionReturn(0);
4851 }
4852 
4853 #undef __FUNCT__
4854 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
4855 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
4856 {
4857   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4858   IS                iscol=a->col,isrow=a->row;
4859   PetscErrorCode    ierr;
4860   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4861   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4862   const MatScalar   *aa=a->a,*v;
4863   PetscScalar       *x,s1,s2,x1,x2,*t;
4864   const PetscScalar *b;
4865 
4866   PetscFunctionBegin;
4867   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4868   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4869   t  = a->solve_work;
4870 
4871   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4872   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4873 
4874   /* forward solve the lower triangular */
4875   idx    = 2*(*r++);
4876   t[0] = b[idx]; t[1] = b[1+idx];
4877   for (i=1; i<n; i++) {
4878     v     = aa + 4*ai[i];
4879     vi    = aj + ai[i];
4880     nz    = diag[i] - ai[i];
4881     idx   = 2*(*r++);
4882     s1  = b[idx]; s2 = b[1+idx];
4883     while (nz--) {
4884       idx   = 2*(*vi++);
4885       x1    = t[idx]; x2 = t[1+idx];
4886       s1 -= v[0]*x1 + v[2]*x2;
4887       s2 -= v[1]*x1 + v[3]*x2;
4888       v += 4;
4889     }
4890     idx = 2*i;
4891     t[idx] = s1; t[1+idx] = s2;
4892   }
4893   /* backward solve the upper triangular */
4894   for (i=n-1; i>=0; i--){
4895     v    = aa + 4*diag[i] + 4;
4896     vi   = aj + diag[i] + 1;
4897     nz   = ai[i+1] - diag[i] - 1;
4898     idt  = 2*i;
4899     s1 = t[idt]; s2 = t[1+idt];
4900     while (nz--) {
4901       idx   = 2*(*vi++);
4902       x1    = t[idx]; x2 = t[1+idx];
4903       s1 -= v[0]*x1 + v[2]*x2;
4904       s2 -= v[1]*x1 + v[3]*x2;
4905       v += 4;
4906     }
4907     idc = 2*(*c--);
4908     v   = aa + 4*diag[i];
4909     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4910     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4911   }
4912   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4913   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4914   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4915   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4916   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4917   PetscFunctionReturn(0);
4918 }
4919 
4920 #undef __FUNCT__
4921 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4922 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4923 {
4924   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4925   IS                iscol=a->col,isrow=a->row;
4926   PetscErrorCode    ierr;
4927   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4928   const PetscInt    *r,*c,*rout,*cout;
4929   const MatScalar   *aa=a->a,*v;
4930   PetscScalar       *x,s1,s2,x1,x2,*t;
4931   const PetscScalar *b;
4932 
4933   PetscFunctionBegin;
4934   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4935   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4936   t  = a->solve_work;
4937 
4938   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4939   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4940 
4941   /* forward solve the lower triangular */
4942   idx    = 2*r[0];
4943   t[0] = b[idx]; t[1] = b[1+idx];
4944   for (i=1; i<n; i++) {
4945     v     = aa + 4*ai[i];
4946     vi    = aj + ai[i];
4947     nz    = ai[i+1] - ai[i];
4948     idx   = 2*r[i];
4949     s1  = b[idx]; s2 = b[1+idx];
4950     for(m=0;m<nz;m++){
4951       jdx   = 2*vi[m];
4952       x1    = t[jdx]; x2 = t[1+jdx];
4953       s1 -= v[0]*x1 + v[2]*x2;
4954       s2 -= v[1]*x1 + v[3]*x2;
4955       v += 4;
4956     }
4957     idx = 2*i;
4958     t[idx] = s1; t[1+idx] = s2;
4959   }
4960   /* backward solve the upper triangular */
4961   for (i=n-1; i>=0; i--){
4962     v    = aa + 4*(adiag[i+1]+1);
4963     vi   = aj + adiag[i+1]+1;
4964     nz   = adiag[i] - adiag[i+1] - 1;
4965     idt  = 2*i;
4966     s1 = t[idt]; s2 = t[1+idt];
4967     for(m=0;m<nz;m++){
4968       idx   = 2*vi[m];
4969       x1    = t[idx]; x2 = t[1+idx];
4970       s1 -= v[0]*x1 + v[2]*x2;
4971       s2 -= v[1]*x1 + v[3]*x2;
4972       v += 4;
4973     }
4974     idc = 2*c[i];
4975     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4976     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4977   }
4978   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4979   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4980   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4981   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4982   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4983   PetscFunctionReturn(0);
4984 }
4985 
4986 /*
4987       Special case where the matrix was ILU(0) factored in the natural
4988    ordering. This eliminates the need for the column and row permutation.
4989 */
4990 #undef __FUNCT__
4991 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
4992 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4993 {
4994   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4995   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4996   PetscErrorCode    ierr;
4997   PetscInt          *diag = a->diag;
4998   const MatScalar   *aa=a->a,*v;
4999   PetscScalar       *x,s1,s2,x1,x2;
5000   const PetscScalar *b;
5001   PetscInt          jdx,idt,idx,nz,*vi,i;
5002 
5003   PetscFunctionBegin;
5004   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5005   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5006 
5007   /* forward solve the lower triangular */
5008   idx    = 0;
5009   x[0]   = b[0]; x[1] = b[1];
5010   for (i=1; i<n; i++) {
5011     v     =  aa      + 4*ai[i];
5012     vi    =  aj      + ai[i];
5013     nz    =  diag[i] - ai[i];
5014     idx   +=  2;
5015     s1  =  b[idx];s2 = b[1+idx];
5016     while (nz--) {
5017       jdx   = 2*(*vi++);
5018       x1    = x[jdx];x2 = x[1+jdx];
5019       s1 -= v[0]*x1 + v[2]*x2;
5020       s2 -= v[1]*x1 + v[3]*x2;
5021       v    += 4;
5022     }
5023     x[idx]   = s1;
5024     x[1+idx] = s2;
5025   }
5026   /* backward solve the upper triangular */
5027   for (i=n-1; i>=0; i--){
5028     v    = aa + 4*diag[i] + 4;
5029     vi   = aj + diag[i] + 1;
5030     nz   = ai[i+1] - diag[i] - 1;
5031     idt  = 2*i;
5032     s1 = x[idt];  s2 = x[1+idt];
5033     while (nz--) {
5034       idx   = 2*(*vi++);
5035       x1    = x[idx];   x2 = x[1+idx];
5036       s1 -= v[0]*x1 + v[2]*x2;
5037       s2 -= v[1]*x1 + v[3]*x2;
5038       v    += 4;
5039     }
5040     v        = aa +  4*diag[i];
5041     x[idt]   = v[0]*s1 + v[2]*s2;
5042     x[1+idt] = v[1]*s1 + v[3]*s2;
5043   }
5044 
5045   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5046   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5047   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5048   PetscFunctionReturn(0);
5049 }
5050 
5051 #undef __FUNCT__
5052 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5053 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5054 {
5055     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5056     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
5057     PetscErrorCode    ierr;
5058     PetscInt          jdx;
5059     const MatScalar   *aa=a->a,*v;
5060     PetscScalar       *x,s1,s2,x1,x2;
5061     const PetscScalar *b;
5062 
5063     PetscFunctionBegin;
5064     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5065     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5066     /* forward solve the lower triangular */
5067     idx    = 0;
5068     x[0] = b[idx]; x[1] = b[1+idx];
5069     for (i=1; i<n; i++) {
5070         v   = aa + 4*ai[i];
5071        vi   = aj + ai[i];
5072        nz   = ai[i+1] - ai[i];
5073        idx  = 2*i;
5074        s1   = b[idx];s2 = b[1+idx];
5075       for(k=0;k<nz;k++){
5076          jdx   = 2*vi[k];
5077           x1    = x[jdx];x2 = x[1+jdx];
5078           s1   -= v[0]*x1 + v[2]*x2;
5079           s2   -= v[1]*x1 + v[3]*x2;
5080            v   +=  4;
5081         }
5082        x[idx]   = s1;
5083        x[1+idx] = s2;
5084     }
5085 
5086    /* backward solve the upper triangular */
5087   for (i=n-1; i>=0; i--){
5088      v   = aa + 4*(adiag[i+1]+1);
5089      vi  = aj + adiag[i+1]+1;
5090      nz  = adiag[i] - adiag[i+1]-1;
5091      idt = 2*i;
5092      s1 = x[idt];  s2 = x[1+idt];
5093      for(k=0;k<nz;k++){
5094       idx   = 2*vi[k];
5095        x1    = x[idx];   x2 = x[1+idx];
5096        s1 -= v[0]*x1 + v[2]*x2;
5097        s2 -= v[1]*x1 + v[3]*x2;
5098          v    += 4;
5099     }
5100     /* x = inv_diagonal*x */
5101    x[idt]   = v[0]*s1 + v[2]*s2;
5102    x[1+idt] = v[1]*s1 + v[3]*s2;
5103   }
5104 
5105   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5106   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5107   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5108   PetscFunctionReturn(0);
5109 }
5110 
5111 #undef __FUNCT__
5112 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5113 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5114 {
5115   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5116   IS             iscol=a->col,isrow=a->row;
5117   PetscErrorCode ierr;
5118   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
5119   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
5120   MatScalar      *aa=a->a,*v;
5121   PetscScalar    *x,*b,s1,*t;
5122 
5123   PetscFunctionBegin;
5124   if (!n) PetscFunctionReturn(0);
5125 
5126   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5127   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5128   t  = a->solve_work;
5129 
5130   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5131   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5132 
5133   /* forward solve the lower triangular */
5134   t[0] = b[*r++];
5135   for (i=1; i<n; i++) {
5136     v     = aa + ai[i];
5137     vi    = aj + ai[i];
5138     nz    = diag[i] - ai[i];
5139     s1  = b[*r++];
5140     while (nz--) {
5141       s1 -= (*v++)*t[*vi++];
5142     }
5143     t[i] = s1;
5144   }
5145   /* backward solve the upper triangular */
5146   for (i=n-1; i>=0; i--){
5147     v    = aa + diag[i] + 1;
5148     vi   = aj + diag[i] + 1;
5149     nz   = ai[i+1] - diag[i] - 1;
5150     s1 = t[i];
5151     while (nz--) {
5152       s1 -= (*v++)*t[*vi++];
5153     }
5154     x[*c--] = t[i] = aa[diag[i]]*s1;
5155   }
5156 
5157   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5158   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5159   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5160   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5161   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5162   PetscFunctionReturn(0);
5163 }
5164 /*
5165       Special case where the matrix was ILU(0) factored in the natural
5166    ordering. This eliminates the need for the column and row permutation.
5167 */
5168 #undef __FUNCT__
5169 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5170 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5171 {
5172   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5173   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
5174   PetscErrorCode ierr;
5175   PetscInt       *diag = a->diag;
5176   MatScalar      *aa=a->a;
5177   PetscScalar    *x,*b;
5178   PetscScalar    s1,x1;
5179   MatScalar      *v;
5180   PetscInt       jdx,idt,idx,nz,*vi,i;
5181 
5182   PetscFunctionBegin;
5183   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5184   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5185 
5186   /* forward solve the lower triangular */
5187   idx    = 0;
5188   x[0]   = b[0];
5189   for (i=1; i<n; i++) {
5190     v     =  aa      + ai[i];
5191     vi    =  aj      + ai[i];
5192     nz    =  diag[i] - ai[i];
5193     idx   +=  1;
5194     s1  =  b[idx];
5195     while (nz--) {
5196       jdx   = *vi++;
5197       x1    = x[jdx];
5198       s1 -= v[0]*x1;
5199       v    += 1;
5200     }
5201     x[idx]   = s1;
5202   }
5203   /* backward solve the upper triangular */
5204   for (i=n-1; i>=0; i--){
5205     v    = aa + diag[i] + 1;
5206     vi   = aj + diag[i] + 1;
5207     nz   = ai[i+1] - diag[i] - 1;
5208     idt  = i;
5209     s1 = x[idt];
5210     while (nz--) {
5211       idx   = *vi++;
5212       x1    = x[idx];
5213       s1 -= v[0]*x1;
5214       v    += 1;
5215     }
5216     v        = aa +  diag[i];
5217     x[idt]   = v[0]*s1;
5218   }
5219   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5220   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5221   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5222   PetscFunctionReturn(0);
5223 }
5224 
5225 /* ----------------------------------------------------------------*/
5226 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5227 
5228 /* bs = 15 for PFLOTRAN */
5229 #undef __FUNCT__
5230 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5231 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5232 {
5233   Mat            C=B;
5234   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5235   PetscErrorCode ierr;
5236   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5237   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5238   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5239   PetscInt       bs2 = a->bs2,bs=A->rmap->bs,flg;
5240   PetscReal      shift = info->shiftinblocks;
5241 
5242   PetscFunctionBegin;
5243 
5244   /* generate work space needed by the factorization */
5245   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5246   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5247 
5248   for (i=0; i<n; i++){
5249     /* zero rtmp */
5250     /* L part */
5251     nz    = bi[i+1] - bi[i];
5252     bjtmp = bj + bi[i];
5253     for  (j=0; j<nz; j++){
5254       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5255     }
5256 
5257     /* U part */
5258     nz = bdiag[i] - bdiag[i+1];
5259     bjtmp = bj + bdiag[i+1]+1;
5260     for  (j=0; j<nz; j++){
5261       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5262     }
5263 
5264     /* load in initial (unfactored row) */
5265     nz    = ai[i+1] - ai[i];
5266     ajtmp = aj + ai[i];
5267     v     = aa + bs2*ai[i];
5268     for (j=0; j<nz; j++) {
5269       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5270     }
5271 
5272     /* elimination */
5273     bjtmp = bj + bi[i];
5274     nzL   = bi[i+1] - bi[i];
5275     for(k=0;k < nzL;k++) {
5276       row = bjtmp[k];
5277       pc = rtmp + bs2*row;
5278       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5279       if (flg) {
5280         pv = b->a + bs2*bdiag[row];
5281 	ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);
5282 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5283         pv = b->a + bs2*(bdiag[row+1]+1);
5284         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5285         for (j=0; j<nz; j++) {
5286           v    = rtmp + bs2*pj[j];
5287 	  ierr = Kernel_A_gets_A_minus_B_times_C_15(v,pc,pv);CHKERRQ(ierr);
5288 	  pv  += bs2;
5289         }
5290         ierr = PetscLogFlops(2*bs2*bs*nz+2*bs2*bs-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5291       }
5292     }
5293 
5294     /* finished row so stick it into b->a */
5295     /* L part */
5296     pv   = b->a + bs2*bi[i] ;
5297     pj   = b->j + bi[i] ;
5298     nz   = bi[i+1] - bi[i];
5299     for (j=0; j<nz; j++) {
5300       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5301     }
5302 
5303     /* Mark diagonal and invert diagonal for simplier triangular solves */
5304     pv   = b->a + bs2*bdiag[i];
5305     pj   = b->j + bdiag[i];
5306     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5307     ierr = Kernel_A_gets_inverse_A_15(pv,shift);CHKERRQ(ierr);
5308 
5309     /* U part */
5310     pv = b->a + bs2*(bdiag[i+1]+1);
5311     pj = b->j + bdiag[i+1]+1;
5312     nz = bdiag[i] - bdiag[i+1] - 1;
5313     for (j=0; j<nz; j++){
5314       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5315     }
5316   }
5317 
5318   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5319   C->ops->solve          = MatSolve_SeqBAIJ_15_NaturalOrdering;
5320   C->ops->solvetranspose = 0;
5321   C->assembled = PETSC_TRUE;
5322   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
5323   PetscFunctionReturn(0);
5324 }
5325 
5326 #undef __FUNCT__
5327 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5328 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5329 {
5330   Mat            C=B;
5331   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5332   IS             isrow = b->row,isicol = b->icol;
5333   PetscErrorCode ierr;
5334   const PetscInt *r,*ic,*ics;
5335   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5336   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5337   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5338   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5339   MatScalar      *v_work;
5340   PetscTruth     col_identity,row_identity,both_identity;
5341 
5342   PetscFunctionBegin;
5343   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5344   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5345 
5346   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5347   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5348   ics  = ic;
5349 
5350   /* generate work space needed by dense LU factorization */
5351   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5352 
5353   for (i=0; i<n; i++){
5354     /* zero rtmp */
5355     /* L part */
5356     nz    = bi[i+1] - bi[i];
5357     bjtmp = bj + bi[i];
5358     for  (j=0; j<nz; j++){
5359       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5360     }
5361 
5362     /* U part */
5363     nz = bdiag[i] - bdiag[i+1];
5364     bjtmp = bj + bdiag[i+1]+1;
5365     for  (j=0; j<nz; j++){
5366       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5367     }
5368 
5369     /* load in initial (unfactored row) */
5370     nz    = ai[r[i]+1] - ai[r[i]];
5371     ajtmp = aj + ai[r[i]];
5372     v     = aa + bs2*ai[r[i]];
5373     for (j=0; j<nz; j++) {
5374       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5375     }
5376 
5377     /* elimination */
5378     bjtmp = bj + bi[i];
5379     nzL   = bi[i+1] - bi[i];
5380     for(k=0;k < nzL;k++) {
5381       row = bjtmp[k];
5382       pc = rtmp + bs2*row;
5383       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5384       if (flg) {
5385         pv         = b->a + bs2*bdiag[row];
5386         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5387         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5388         pv         = b->a + bs2*(bdiag[row+1]+1);
5389         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5390         for (j=0; j<nz; j++) {
5391           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5392         }
5393         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5394       }
5395     }
5396 
5397     /* finished row so stick it into b->a */
5398     /* L part */
5399     pv   = b->a + bs2*bi[i] ;
5400     pj   = b->j + bi[i] ;
5401     nz   = bi[i+1] - bi[i];
5402     for (j=0; j<nz; j++) {
5403       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5404     }
5405 
5406     /* Mark diagonal and invert diagonal for simplier triangular solves */
5407     pv  = b->a + bs2*bdiag[i];
5408     pj  = b->j + bdiag[i];
5409     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5410     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5411     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5412 
5413     /* U part */
5414     pv = b->a + bs2*(bdiag[i+1]+1);
5415     pj = b->j + bdiag[i+1]+1;
5416     nz = bdiag[i] - bdiag[i+1] - 1;
5417     for (j=0; j<nz; j++){
5418       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5419     }
5420   }
5421 
5422   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5423   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5424   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5425   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5426 
5427   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5428   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5429   both_identity = (PetscTruth) (row_identity && col_identity);
5430   if (both_identity){
5431     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5432   } else {
5433     C->ops->solve = MatSolve_SeqBAIJ_N;
5434   }
5435   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5436 
5437   C->assembled = PETSC_TRUE;
5438   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5439   PetscFunctionReturn(0);
5440 }
5441 
5442 /*
5443    ilu(0) with natural ordering under new data structure.
5444    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5445    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5446 */
5447 
5448 #undef __FUNCT__
5449 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5450 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5451 {
5452 
5453   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5454   PetscErrorCode     ierr;
5455   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5456   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5457 
5458   PetscFunctionBegin;
5459   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5460   b    = (Mat_SeqBAIJ*)(fact)->data;
5461 
5462   /* allocate matrix arrays for new data structure */
5463   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5464   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5465   b->singlemalloc = PETSC_TRUE;
5466   if (!b->diag){
5467     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5468     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5469   }
5470   bdiag = b->diag;
5471 
5472   if (n > 0) {
5473     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5474   }
5475 
5476   /* set bi and bj with new data structure */
5477   bi = b->i;
5478   bj = b->j;
5479 
5480   /* L part */
5481   bi[0] = 0;
5482   for (i=0; i<n; i++){
5483     nz = adiag[i] - ai[i];
5484     bi[i+1] = bi[i] + nz;
5485     aj = a->j + ai[i];
5486     for (j=0; j<nz; j++){
5487       *bj = aj[j]; bj++;
5488     }
5489   }
5490 
5491   /* U part */
5492   bi_temp = bi[n];
5493   bdiag[n] = bi[n]-1;
5494   for (i=n-1; i>=0; i--){
5495     nz = ai[i+1] - adiag[i] - 1;
5496     bi_temp = bi_temp + nz + 1;
5497     aj = a->j + adiag[i] + 1;
5498     for (j=0; j<nz; j++){
5499       *bj = aj[j]; bj++;
5500     }
5501     /* diag[i] */
5502     *bj = i; bj++;
5503     bdiag[i] = bi_temp - 1;
5504   }
5505   PetscFunctionReturn(0);
5506 }
5507 
5508 #undef __FUNCT__
5509 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5510 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5511 {
5512   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5513   IS                 isicol;
5514   PetscErrorCode     ierr;
5515   const PetscInt     *r,*ic;
5516   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5517   PetscInt           *bi,*cols,nnz,*cols_lvl;
5518   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5519   PetscInt           i,levels,diagonal_fill;
5520   PetscTruth         col_identity,row_identity,both_identity;
5521   PetscReal          f;
5522   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5523   PetscBT            lnkbt;
5524   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5525   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5526   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5527   PetscTruth         missing;
5528   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5529   PetscTruth         olddatastruct = PETSC_FALSE;
5530 
5531   PetscFunctionBegin;
5532   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_old",&olddatastruct,PETSC_NULL);CHKERRQ(ierr);
5533   if (olddatastruct){
5534     ierr = MatILUFactorSymbolic_SeqBAIJ_inplace(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5535     PetscFunctionReturn(0);
5536   }
5537   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5538   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5539   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5540 
5541   f             = info->fill;
5542   levels        = (PetscInt)info->levels;
5543   diagonal_fill = (PetscInt)info->diagonal_fill;
5544   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5545 
5546   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5547   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5548   both_identity = (PetscTruth) (row_identity && col_identity);
5549 
5550   if (!levels && both_identity) {
5551     /* special case: ilu(0) with natural ordering */
5552     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5553     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5554 
5555     fact->factor = MAT_FACTOR_ILU;
5556     (fact)->info.factor_mallocs    = 0;
5557     (fact)->info.fill_ratio_given  = info->fill;
5558     (fact)->info.fill_ratio_needed = 1.0;
5559     b                = (Mat_SeqBAIJ*)(fact)->data;
5560     b->row           = isrow;
5561     b->col           = iscol;
5562     b->icol          = isicol;
5563     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5564     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5565     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5566     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5567     PetscFunctionReturn(0);
5568   }
5569 
5570   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5571   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5572 
5573   /* get new row pointers */
5574   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5575   bi[0] = 0;
5576   /* bdiag is location of diagonal in factor */
5577   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5578   bdiag[0]  = 0;
5579 
5580   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5581 
5582   /* create a linked list for storing column indices of the active row */
5583   nlnk = n + 1;
5584   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5585 
5586   /* initial FreeSpace size is f*(ai[n]+1) */
5587   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5588   current_space = free_space;
5589   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5590   current_space_lvl = free_space_lvl;
5591 
5592   for (i=0; i<n; i++) {
5593     nzi = 0;
5594     /* copy current row into linked list */
5595     nnz  = ai[r[i]+1] - ai[r[i]];
5596     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5597     cols = aj + ai[r[i]];
5598     lnk[i] = -1; /* marker to indicate if diagonal exists */
5599     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5600     nzi += nlnk;
5601 
5602     /* make sure diagonal entry is included */
5603     if (diagonal_fill && lnk[i] == -1) {
5604       fm = n;
5605       while (lnk[fm] < i) fm = lnk[fm];
5606       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5607       lnk[fm]    = i;
5608       lnk_lvl[i] = 0;
5609       nzi++; dcount++;
5610     }
5611 
5612     /* add pivot rows into the active row */
5613     nzbd = 0;
5614     prow = lnk[n];
5615     while (prow < i) {
5616       nnz      = bdiag[prow];
5617       cols     = bj_ptr[prow] + nnz + 1;
5618       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5619       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5620       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5621       nzi += nlnk;
5622       prow = lnk[prow];
5623       nzbd++;
5624     }
5625     bdiag[i] = nzbd;
5626     bi[i+1]  = bi[i] + nzi;
5627 
5628     /* if free space is not available, make more free space */
5629     if (current_space->local_remaining<nzi) {
5630       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5631       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5632       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5633       reallocs++;
5634     }
5635 
5636     /* copy data into free_space and free_space_lvl, then initialize lnk */
5637     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5638     bj_ptr[i]    = current_space->array;
5639     bjlvl_ptr[i] = current_space_lvl->array;
5640 
5641     /* make sure the active row i has diagonal entry */
5642     if (*(bj_ptr[i]+bdiag[i]) != i) {
5643       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5644     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5645     }
5646 
5647     current_space->array           += nzi;
5648     current_space->local_used      += nzi;
5649     current_space->local_remaining -= nzi;
5650     current_space_lvl->array           += nzi;
5651     current_space_lvl->local_used      += nzi;
5652     current_space_lvl->local_remaining -= nzi;
5653   }
5654 
5655   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5656   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5657 
5658   /* destroy list of free space and other temporary arrays */
5659   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5660 
5661   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5662   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5663 
5664   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5665   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5666   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5667 
5668 #if defined(PETSC_USE_INFO)
5669   {
5670     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5671     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5672     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5673     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5674     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5675     if (diagonal_fill) {
5676       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5677     }
5678   }
5679 #endif
5680 
5681   /* put together the new matrix */
5682   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5683   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5684   b = (Mat_SeqBAIJ*)(fact)->data;
5685   b->free_a       = PETSC_TRUE;
5686   b->free_ij      = PETSC_TRUE;
5687   b->singlemalloc = PETSC_FALSE;
5688   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5689   b->j          = bj;
5690   b->i          = bi;
5691   b->diag       = bdiag;
5692   b->free_diag  = PETSC_TRUE;
5693   b->ilen       = 0;
5694   b->imax       = 0;
5695   b->row        = isrow;
5696   b->col        = iscol;
5697   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5698   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5699   b->icol       = isicol;
5700   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5701   /* In b structure:  Free imax, ilen, old a, old j.
5702      Allocate bdiag, solve_work, new a, new j */
5703   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5704   b->maxnz = b->nz = bdiag[0]+1;
5705   fact->info.factor_mallocs    = reallocs;
5706   fact->info.fill_ratio_given  = f;
5707   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5708   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5709   PetscFunctionReturn(0);
5710 }
5711 
5712 
5713 /*
5714      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5715    except that the data structure of Mat_SeqAIJ is slightly different.
5716    Not a good example of code reuse.
5717 */
5718 #undef __FUNCT__
5719 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
5720 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5721 {
5722   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5723   IS             isicol;
5724   PetscErrorCode ierr;
5725   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5726   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5727   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5728   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5729   PetscTruth     col_identity,row_identity,both_identity,flg;
5730   PetscReal      f;
5731 
5732   PetscFunctionBegin;
5733   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5734   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5735 
5736   f             = info->fill;
5737   levels        = (PetscInt)info->levels;
5738   diagonal_fill = (PetscInt)info->diagonal_fill;
5739   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5740 
5741   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5742   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5743   both_identity = (PetscTruth) (row_identity && col_identity);
5744 
5745   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5746     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5747     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5748 
5749     fact->factor = MAT_FACTOR_ILU;
5750     b            = (Mat_SeqBAIJ*)fact->data;
5751     b->row       = isrow;
5752     b->col       = iscol;
5753     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5754     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5755     b->icol      = isicol;
5756     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5757     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5758     PetscFunctionReturn(0);
5759   }
5760 
5761   /* general case perform the symbolic factorization */
5762     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5763     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5764 
5765     /* get new row pointers */
5766     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5767     ainew[0] = 0;
5768     /* don't know how many column pointers are needed so estimate */
5769     jmax = (PetscInt)(f*ai[n] + 1);
5770     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5771     /* ajfill is level of fill for each fill entry */
5772     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5773     /* fill is a linked list of nonzeros in active row */
5774     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5775     /* im is level for each filled value */
5776     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5777     /* dloc is location of diagonal in factor */
5778     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5779     dloc[0]  = 0;
5780     for (prow=0; prow<n; prow++) {
5781 
5782       /* copy prow into linked list */
5783       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5784       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5785       xi         = aj + ai[r[prow]];
5786       fill[n]    = n;
5787       fill[prow] = -1; /* marker for diagonal entry */
5788       while (nz--) {
5789 	fm  = n;
5790 	idx = ic[*xi++];
5791 	do {
5792 	  m  = fm;
5793 	  fm = fill[m];
5794 	} while (fm < idx);
5795 	fill[m]   = idx;
5796 	fill[idx] = fm;
5797 	im[idx]   = 0;
5798       }
5799 
5800       /* make sure diagonal entry is included */
5801       if (diagonal_fill && fill[prow] == -1) {
5802 	fm = n;
5803 	while (fill[fm] < prow) fm = fill[fm];
5804 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5805 	fill[fm]   = prow;
5806 	im[prow]   = 0;
5807 	nzf++;
5808 	dcount++;
5809       }
5810 
5811       nzi = 0;
5812       row = fill[n];
5813       while (row < prow) {
5814 	incrlev = im[row] + 1;
5815 	nz      = dloc[row];
5816 	xi      = ajnew  + ainew[row] + nz + 1;
5817 	flev    = ajfill + ainew[row] + nz + 1;
5818 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5819 	fm      = row;
5820 	while (nnz-- > 0) {
5821 	  idx = *xi++;
5822 	  if (*flev + incrlev > levels) {
5823 	    flev++;
5824 	    continue;
5825 	  }
5826 	  do {
5827 	    m  = fm;
5828 	    fm = fill[m];
5829 	  } while (fm < idx);
5830 	  if (fm != idx) {
5831 	    im[idx]   = *flev + incrlev;
5832 	    fill[m]   = idx;
5833 	    fill[idx] = fm;
5834 	    fm        = idx;
5835 	    nzf++;
5836 	  } else {
5837 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5838 	  }
5839 	  flev++;
5840 	}
5841 	row = fill[row];
5842 	nzi++;
5843       }
5844       /* copy new filled row into permanent storage */
5845       ainew[prow+1] = ainew[prow] + nzf;
5846       if (ainew[prow+1] > jmax) {
5847 
5848 	/* estimate how much additional space we will need */
5849 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5850 	/* just double the memory each time */
5851 	PetscInt maxadd = jmax;
5852 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5853 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5854 	jmax += maxadd;
5855 
5856 	/* allocate a longer ajnew and ajfill */
5857 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5858 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5859 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5860 	ajnew = xitmp;
5861 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5862 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5863 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5864 	ajfill = xitmp;
5865 	reallocate++; /* count how many reallocations are needed */
5866       }
5867       xitmp       = ajnew + ainew[prow];
5868       flev        = ajfill + ainew[prow];
5869       dloc[prow]  = nzi;
5870       fm          = fill[n];
5871       while (nzf--) {
5872 	*xitmp++ = fm;
5873 	*flev++ = im[fm];
5874 	fm      = fill[fm];
5875       }
5876       /* make sure row has diagonal entry */
5877       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5878 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5879     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5880       }
5881     }
5882     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5883     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5884     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5885     ierr = PetscFree(fill);CHKERRQ(ierr);
5886     ierr = PetscFree(im);CHKERRQ(ierr);
5887 
5888 #if defined(PETSC_USE_INFO)
5889     {
5890       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5891       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5892       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5893       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5894       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5895       if (diagonal_fill) {
5896 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5897       }
5898     }
5899 #endif
5900 
5901     /* put together the new matrix */
5902     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5903     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5904     b    = (Mat_SeqBAIJ*)fact->data;
5905     b->free_a       = PETSC_TRUE;
5906     b->free_ij      = PETSC_TRUE;
5907     b->singlemalloc = PETSC_FALSE;
5908     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5909     b->j          = ajnew;
5910     b->i          = ainew;
5911     for (i=0; i<n; i++) dloc[i] += ainew[i];
5912     b->diag       = dloc;
5913     b->free_diag  = PETSC_TRUE;
5914     b->ilen       = 0;
5915     b->imax       = 0;
5916     b->row        = isrow;
5917     b->col        = iscol;
5918     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5919     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5920     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5921     b->icol       = isicol;
5922     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5923     /* In b structure:  Free imax, ilen, old a, old j.
5924        Allocate dloc, solve_work, new a, new j */
5925     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5926     b->maxnz          = b->nz = ainew[n];
5927 
5928     fact->info.factor_mallocs    = reallocate;
5929     fact->info.fill_ratio_given  = f;
5930     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5931 
5932   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5933   PetscFunctionReturn(0);
5934 }
5935 
5936 #undef __FUNCT__
5937 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5938 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5939 {
5940   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5941   /* int i,*AJ=a->j,nz=a->nz; */
5942   PetscFunctionBegin;
5943   /* Undo Column scaling */
5944 /*    while (nz--) { */
5945 /*      AJ[i] = AJ[i]/4; */
5946 /*    } */
5947   /* This should really invoke a push/pop logic, but we don't have that yet. */
5948   A->ops->setunfactored = PETSC_NULL;
5949   PetscFunctionReturn(0);
5950 }
5951 
5952 #undef __FUNCT__
5953 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5954 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5955 {
5956   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5957   PetscInt       *AJ=a->j,nz=a->nz;
5958   unsigned short *aj=(unsigned short *)AJ;
5959   PetscFunctionBegin;
5960   /* Is this really necessary? */
5961   while (nz--) {
5962     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5963   }
5964   A->ops->setunfactored = PETSC_NULL;
5965   PetscFunctionReturn(0);
5966 }
5967 
5968 
5969