xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 8499736ad707674778426e240b94526f94a8d49e)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt       nz,idx,idt,j,i,oidx;
125   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
126   MatScalar      *aa=a->a,*v;
127   PetscScalar    s1,s2,x1,x2;
128   PetscScalar    *x,*b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode ierr;
182   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183   PetscInt       *diag = a->diag,oidx;
184   MatScalar      *aa=a->a,*v;
185   PetscScalar    s1,s2,s3,x1,x2,x3;
186   PetscScalar    *x,*b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode ierr;
244   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245   PetscInt       nz,idx,idt,j,i,oidx;
246   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
247   MatScalar      *aa=a->a,*v;
248   PetscScalar    s1,s2,s3,x1,x2,x3;
249   PetscScalar    *x,*b;
250 
251   PetscFunctionBegin;
252   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255 
256   /* forward solve the U^T */
257   idx = 0;
258   for (i=0; i<n; i++) {
259     v     = aa + bs2*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265     v -= bs2;
266 
267     vi    = aj + diag[i] - 1;
268     nz    = diag[i] - diag[i+1] - 1;
269     for(j=0;j>-nz;j--){
270       oidx = bs*vi[j];
271       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274       v  -= bs2;
275     }
276     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277     idx += bs;
278   }
279   /* backward solve the L^T */
280   for (i=n-1; i>=0; i--){
281     v    = aa + bs2*ai[i];
282     vi   = aj + ai[i];
283     nz   = ai[i+1] - ai[i];
284     idt  = bs*i;
285     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286     for(j=0;j<nz;j++){
287       idx   = bs*vi[j];
288       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291       v += bs2;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode ierr;
306   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307   PetscInt       *diag = a->diag,oidx;
308   MatScalar      *aa=a->a,*v;
309   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
310   PetscScalar    *x,*b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 16*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328     v += 16;
329 
330     vi    = aj + diag[i] + 1;
331     nz    = ai[i+1] - diag[i] - 1;
332     while (nz--) {
333       oidx = 4*(*vi++);
334       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338       v  += 16;
339     }
340     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341     idx += 4;
342   }
343   /* backward solve the L^T */
344   for (i=n-1; i>=0; i--){
345     v    = aa + 16*diag[i] - 16;
346     vi   = aj + diag[i] - 1;
347     nz   = diag[i] - ai[i];
348     idt  = 4*i;
349     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350     while (nz--) {
351       idx   = 4*(*vi--);
352       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356       v -= 16;
357     }
358   }
359   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct"
367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
370   PetscErrorCode ierr;
371   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372   PetscInt       nz,idx,idt,j,i,oidx;
373   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
374   MatScalar      *aa=a->a,*v;
375   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
376   PetscScalar    *x,*b;
377 
378   PetscFunctionBegin;
379   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382 
383   /* forward solve the U^T */
384   idx = 0;
385   for (i=0; i<n; i++) {
386     v     = aa + bs2*diag[i];
387     /* multiply by the inverse of the block diagonal */
388     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393     v -= bs2;
394 
395     vi    = aj + diag[i] - 1;
396     nz    = diag[i] - diag[i+1] - 1;
397     for(j=0;j>-nz;j--){
398       oidx = bs*vi[j];
399       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403       v  -= bs2;
404     }
405     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406     idx += bs;
407   }
408   /* backward solve the L^T */
409   for (i=n-1; i>=0; i--){
410     v    = aa + bs2*ai[i];
411     vi   = aj + ai[i];
412     nz   = ai[i+1] - ai[i];
413     idt  = bs*i;
414     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415     for(j=0;j<nz;j++){
416       idx   = bs*vi[j];
417       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421       v += bs2;
422     }
423   }
424   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
435   PetscErrorCode ierr;
436   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
437   PetscInt       *diag = a->diag,oidx;
438   MatScalar      *aa=a->a,*v;
439   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
440   PetscScalar    *x,*b;
441 
442   PetscFunctionBegin;
443   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446 
447   /* forward solve the U^T */
448   idx = 0;
449   for (i=0; i<n; i++) {
450 
451     v     = aa + 25*diag[i];
452     /* multiply by the inverse of the block diagonal */
453     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459     v += 25;
460 
461     vi    = aj + diag[i] + 1;
462     nz    = ai[i+1] - diag[i] - 1;
463     while (nz--) {
464       oidx = 5*(*vi++);
465       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470       v  += 25;
471     }
472     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473     idx += 5;
474   }
475   /* backward solve the L^T */
476   for (i=n-1; i>=0; i--){
477     v    = aa + 25*diag[i] - 25;
478     vi   = aj + diag[i] - 1;
479     nz   = diag[i] - ai[i];
480     idt  = 5*i;
481     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482     while (nz--) {
483       idx   = 5*(*vi--);
484       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489       v -= 25;
490     }
491   }
492   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct"
500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
501 {
502   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503   PetscErrorCode ierr;
504   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505   PetscInt       nz,idx,idt,j,i,oidx;
506   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507   MatScalar      *aa=a->a,*v;
508   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
509   PetscScalar    *x,*b;
510 
511   PetscFunctionBegin;
512   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515 
516   /* forward solve the U^T */
517   idx = 0;
518   for (i=0; i<n; i++) {
519     v     = aa + bs2*diag[i];
520     /* multiply by the inverse of the block diagonal */
521     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522     x5 = x[4+idx];
523     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528     v -= bs2;
529 
530     vi    = aj + diag[i] - 1;
531     nz    = diag[i] - diag[i+1] - 1;
532     for(j=0;j>-nz;j--){
533       oidx = bs*vi[j];
534       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539       v  -= bs2;
540     }
541     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542     idx += bs;
543   }
544   /* backward solve the L^T */
545   for (i=n-1; i>=0; i--){
546     v    = aa + bs2*ai[i];
547     vi   = aj + ai[i];
548     nz   = ai[i+1] - ai[i];
549     idt  = bs*i;
550     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551     for(j=0;j<nz;j++){
552       idx   = bs*vi[j];
553       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558       v += bs2;
559     }
560   }
561   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
562   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
570 {
571   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
572   PetscErrorCode ierr;
573   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
574   PetscInt       *diag = a->diag,oidx;
575   MatScalar      *aa=a->a,*v;
576   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
577   PetscScalar    *x,*b;
578 
579   PetscFunctionBegin;
580   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583 
584   /* forward solve the U^T */
585   idx = 0;
586   for (i=0; i<n; i++) {
587 
588     v     = aa + 36*diag[i];
589     /* multiply by the inverse of the block diagonal */
590     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591     x6    = x[5+idx];
592     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598     v += 36;
599 
600     vi    = aj + diag[i] + 1;
601     nz    = ai[i+1] - diag[i] - 1;
602     while (nz--) {
603       oidx = 6*(*vi++);
604       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610       v  += 36;
611     }
612     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613     x[5+idx] = s6;
614     idx += 6;
615   }
616   /* backward solve the L^T */
617   for (i=n-1; i>=0; i--){
618     v    = aa + 36*diag[i] - 36;
619     vi   = aj + diag[i] - 1;
620     nz   = diag[i] - ai[i];
621     idt  = 6*i;
622     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623     s6 = x[5+idt];
624     while (nz--) {
625       idx   = 6*(*vi--);
626       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v -= 36;
633     }
634   }
635   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #undef __FUNCT__
642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct"
643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
644 {
645   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
646   PetscErrorCode ierr;
647   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648   PetscInt       nz,idx,idt,j,i,oidx;
649   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
650   MatScalar      *aa=a->a,*v;
651   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
652   PetscScalar    *x,*b;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658 
659   /* forward solve the U^T */
660   idx = 0;
661   for (i=0; i<n; i++) {
662     v     = aa + bs2*diag[i];
663     /* multiply by the inverse of the block diagonal */
664     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665     x5 = x[4+idx]; x6 = x[5+idx];
666     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672     v -= bs2;
673 
674     vi    = aj + diag[i] - 1;
675     nz    = diag[i] - diag[i+1] - 1;
676     for(j=0;j>-nz;j--){
677       oidx = bs*vi[j];
678       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v  -= bs2;
685     }
686     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687     x[5+idx] = s6;
688     idx += bs;
689   }
690   /* backward solve the L^T */
691   for (i=n-1; i>=0; i--){
692     v    = aa + bs2*ai[i];
693     vi   = aj + ai[i];
694     nz   = ai[i+1] - ai[i];
695     idt  = bs*i;
696     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697     s6   = x[5+idt];
698     for(j=0;j<nz;j++){
699       idx   = bs*vi[j];
700       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706       v += bs2;
707     }
708   }
709   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
718 {
719   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
720   PetscErrorCode ierr;
721   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
722   PetscInt       *diag = a->diag,oidx;
723   MatScalar      *aa=a->a,*v;
724   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
725   PetscScalar    *x,*b;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 49*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739     x6    = x[5+idx]; x7 = x[6+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747     v += 49;
748 
749     vi    = aj + diag[i] + 1;
750     nz    = ai[i+1] - diag[i] - 1;
751     while (nz--) {
752       oidx = 7*(*vi++);
753       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760       v  += 49;
761     }
762     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763     x[5+idx] = s6;x[6+idx] = s7;
764     idx += 7;
765   }
766   /* backward solve the L^T */
767   for (i=n-1; i>=0; i--){
768     v    = aa + 49*diag[i] - 49;
769     vi   = aj + diag[i] - 1;
770     nz   = diag[i] - ai[i];
771     idt  = 7*i;
772     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773     s6 = x[5+idt];s7 = x[6+idt];
774     while (nz--) {
775       idx   = 7*(*vi--);
776       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783       v -= 49;
784     }
785   }
786   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796   PetscErrorCode ierr;
797   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798   PetscInt       nz,idx,idt,j,i,oidx;
799   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
800   MatScalar      *aa=a->a,*v;
801   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
802   PetscScalar    *x,*b;
803 
804   PetscFunctionBegin;
805   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808 
809   /* forward solve the U^T */
810   idx = 0;
811   for (i=0; i<n; i++) {
812     v     = aa + bs2*diag[i];
813     /* multiply by the inverse of the block diagonal */
814     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823     v -= bs2;
824     vi    = aj + diag[i] - 1;
825     nz    = diag[i] - diag[i+1] - 1;
826     for(j=0;j>-nz;j--){
827       oidx = bs*vi[j];
828       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v  -= bs2;
836     }
837     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838     x[5+idx] = s6;  x[6+idx] = s7;
839     idx += bs;
840   }
841   /* backward solve the L^T */
842   for (i=n-1; i>=0; i--){
843     v    = aa + bs2*ai[i];
844     vi   = aj + ai[i];
845     nz   = ai[i+1] - ai[i];
846     idt  = bs*i;
847     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848     s6   = x[5+idt];  s7 = x[6+idt];
849     for(j=0;j<nz;j++){
850       idx   = bs*vi[j];
851       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858       v += bs2;
859     }
860   }
861   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*---------------------------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
873   IS             iscol=a->col,isrow=a->row;
874   PetscErrorCode ierr;
875   const PetscInt *r,*c,*rout,*cout;
876   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
877   PetscInt       *diag = a->diag;
878   MatScalar      *aa=a->a,*v;
879   PetscScalar    s1,*x,*b,*t;
880 
881   PetscFunctionBegin;
882   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
883   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
884   t  = a->solve_work;
885 
886   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
887   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
888 
889   /* copy the b into temp work space according to permutation */
890   for (i=0; i<n; i++) {
891     t[i] = b[c[i]];
892   }
893 
894   /* forward solve the U^T */
895   for (i=0; i<n; i++) {
896 
897     v     = aa + diag[i];
898     /* multiply by the inverse of the block diagonal */
899     s1    = (*v++)*t[i];
900     vi    = aj + diag[i] + 1;
901     nz    = ai[i+1] - diag[i] - 1;
902     while (nz--) {
903       t[*vi++]  -= (*v++)*s1;
904     }
905     t[i]   = s1;
906   }
907   /* backward solve the L^T */
908   for (i=n-1; i>=0; i--){
909     v    = aa + diag[i] - 1;
910     vi   = aj + diag[i] - 1;
911     nz   = diag[i] - ai[i];
912     s1   = t[i];
913     while (nz--) {
914       t[*vi--]   -=  (*v--)*s1;
915     }
916   }
917 
918   /* copy t into x according to permutation */
919   for (i=0; i<n; i++) {
920     x[r[i]]   = t[i];
921   }
922 
923   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
924   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
925   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
926   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
927   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 
931 #undef __FUNCT__
932 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
933 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
934 {
935   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
936   IS             iscol=a->col,isrow=a->row;
937   PetscErrorCode ierr;
938   const PetscInt *r,*c,*rout,*cout;
939   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
940   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
941   MatScalar      *aa=a->a,*v;
942   PetscScalar    s1,s2,x1,x2;
943   PetscScalar    *x,*b,*t;
944 
945   PetscFunctionBegin;
946   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
947   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
948   t  = a->solve_work;
949 
950   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
951   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
952 
953   /* copy the b into temp work space according to permutation */
954   ii = 0;
955   for (i=0; i<n; i++) {
956     ic      = 2*c[i];
957     t[ii]   = b[ic];
958     t[ii+1] = b[ic+1];
959     ii += 2;
960   }
961 
962   /* forward solve the U^T */
963   idx = 0;
964   for (i=0; i<n; i++) {
965 
966     v     = aa + 4*diag[i];
967     /* multiply by the inverse of the block diagonal */
968     x1    = t[idx];   x2 = t[1+idx];
969     s1 = v[0]*x1  +  v[1]*x2;
970     s2 = v[2]*x1  +  v[3]*x2;
971     v += 4;
972 
973     vi    = aj + diag[i] + 1;
974     nz    = ai[i+1] - diag[i] - 1;
975     while (nz--) {
976       oidx = 2*(*vi++);
977       t[oidx]   -= v[0]*s1  +  v[1]*s2;
978       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
979       v  += 4;
980     }
981     t[idx]   = s1;t[1+idx] = s2;
982     idx += 2;
983   }
984   /* backward solve the L^T */
985   for (i=n-1; i>=0; i--){
986     v    = aa + 4*diag[i] - 4;
987     vi   = aj + diag[i] - 1;
988     nz   = diag[i] - ai[i];
989     idt  = 2*i;
990     s1 = t[idt];  s2 = t[1+idt];
991     while (nz--) {
992       idx   = 2*(*vi--);
993       t[idx]   -=  v[0]*s1 +  v[1]*s2;
994       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
995       v -= 4;
996     }
997   }
998 
999   /* copy t into x according to permutation */
1000   ii = 0;
1001   for (i=0; i<n; i++) {
1002     ir      = 2*r[i];
1003     x[ir]   = t[ii];
1004     x[ir+1] = t[ii+1];
1005     ii += 2;
1006   }
1007 
1008   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1009   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1010   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1011   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1012   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1013   PetscFunctionReturn(0);
1014 }
1015 
1016 #undef __FUNCT__
1017 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1018 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1019 {
1020   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1021   IS             iscol=a->col,isrow=a->row;
1022   PetscErrorCode ierr;
1023   const PetscInt *r,*c,*rout,*cout;
1024   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1025   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1026   MatScalar      *aa=a->a,*v;
1027   PetscScalar    s1,s2,s3,x1,x2,x3;
1028   PetscScalar    *x,*b,*t;
1029 
1030   PetscFunctionBegin;
1031   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1032   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1033   t  = a->solve_work;
1034 
1035   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1036   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1037 
1038   /* copy the b into temp work space according to permutation */
1039   ii = 0;
1040   for (i=0; i<n; i++) {
1041     ic      = 3*c[i];
1042     t[ii]   = b[ic];
1043     t[ii+1] = b[ic+1];
1044     t[ii+2] = b[ic+2];
1045     ii += 3;
1046   }
1047 
1048   /* forward solve the U^T */
1049   idx = 0;
1050   for (i=0; i<n; i++) {
1051 
1052     v     = aa + 9*diag[i];
1053     /* multiply by the inverse of the block diagonal */
1054     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1055     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1056     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1057     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1058     v += 9;
1059 
1060     vi    = aj + diag[i] + 1;
1061     nz    = ai[i+1] - diag[i] - 1;
1062     while (nz--) {
1063       oidx = 3*(*vi++);
1064       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1065       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1066       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1067       v  += 9;
1068     }
1069     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1070     idx += 3;
1071   }
1072   /* backward solve the L^T */
1073   for (i=n-1; i>=0; i--){
1074     v    = aa + 9*diag[i] - 9;
1075     vi   = aj + diag[i] - 1;
1076     nz   = diag[i] - ai[i];
1077     idt  = 3*i;
1078     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1079     while (nz--) {
1080       idx   = 3*(*vi--);
1081       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1082       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1083       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1084       v -= 9;
1085     }
1086   }
1087 
1088   /* copy t into x according to permutation */
1089   ii = 0;
1090   for (i=0; i<n; i++) {
1091     ir      = 3*r[i];
1092     x[ir]   = t[ii];
1093     x[ir+1] = t[ii+1];
1094     x[ir+2] = t[ii+2];
1095     ii += 3;
1096   }
1097 
1098   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1099   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1100   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1101   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1102   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1103   PetscFunctionReturn(0);
1104 }
1105 
1106 #undef __FUNCT__
1107 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1108 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1109 {
1110   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1111   IS             iscol=a->col,isrow=a->row;
1112   PetscErrorCode ierr;
1113   const PetscInt *r,*c,*rout,*cout;
1114   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1115   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1116   MatScalar      *aa=a->a,*v;
1117   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1118   PetscScalar    *x,*b,*t;
1119 
1120   PetscFunctionBegin;
1121   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1122   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1123   t  = a->solve_work;
1124 
1125   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1126   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1127 
1128   /* copy the b into temp work space according to permutation */
1129   ii = 0;
1130   for (i=0; i<n; i++) {
1131     ic      = 4*c[i];
1132     t[ii]   = b[ic];
1133     t[ii+1] = b[ic+1];
1134     t[ii+2] = b[ic+2];
1135     t[ii+3] = b[ic+3];
1136     ii += 4;
1137   }
1138 
1139   /* forward solve the U^T */
1140   idx = 0;
1141   for (i=0; i<n; i++) {
1142 
1143     v     = aa + 16*diag[i];
1144     /* multiply by the inverse of the block diagonal */
1145     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1146     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1147     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1148     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1149     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1150     v += 16;
1151 
1152     vi    = aj + diag[i] + 1;
1153     nz    = ai[i+1] - diag[i] - 1;
1154     while (nz--) {
1155       oidx = 4*(*vi++);
1156       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1157       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1158       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1159       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1160       v  += 16;
1161     }
1162     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1163     idx += 4;
1164   }
1165   /* backward solve the L^T */
1166   for (i=n-1; i>=0; i--){
1167     v    = aa + 16*diag[i] - 16;
1168     vi   = aj + diag[i] - 1;
1169     nz   = diag[i] - ai[i];
1170     idt  = 4*i;
1171     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1172     while (nz--) {
1173       idx   = 4*(*vi--);
1174       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1175       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1176       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1177       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1178       v -= 16;
1179     }
1180   }
1181 
1182   /* copy t into x according to permutation */
1183   ii = 0;
1184   for (i=0; i<n; i++) {
1185     ir      = 4*r[i];
1186     x[ir]   = t[ii];
1187     x[ir+1] = t[ii+1];
1188     x[ir+2] = t[ii+2];
1189     x[ir+3] = t[ii+3];
1190     ii += 4;
1191   }
1192 
1193   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1194   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1195   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1196   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1197   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1198   PetscFunctionReturn(0);
1199 }
1200 
1201 #undef __FUNCT__
1202 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1203 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1204 {
1205   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1206   IS             iscol=a->col,isrow=a->row;
1207   PetscErrorCode ierr;
1208   const PetscInt *r,*c,*rout,*cout;
1209   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1210   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1211   MatScalar      *aa=a->a,*v;
1212   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1213   PetscScalar    *x,*b,*t;
1214 
1215   PetscFunctionBegin;
1216   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1217   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1218   t  = a->solve_work;
1219 
1220   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1221   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1222 
1223   /* copy the b into temp work space according to permutation */
1224   ii = 0;
1225   for (i=0; i<n; i++) {
1226     ic      = 5*c[i];
1227     t[ii]   = b[ic];
1228     t[ii+1] = b[ic+1];
1229     t[ii+2] = b[ic+2];
1230     t[ii+3] = b[ic+3];
1231     t[ii+4] = b[ic+4];
1232     ii += 5;
1233   }
1234 
1235   /* forward solve the U^T */
1236   idx = 0;
1237   for (i=0; i<n; i++) {
1238 
1239     v     = aa + 25*diag[i];
1240     /* multiply by the inverse of the block diagonal */
1241     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1243     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1244     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1245     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1246     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1247     v += 25;
1248 
1249     vi    = aj + diag[i] + 1;
1250     nz    = ai[i+1] - diag[i] - 1;
1251     while (nz--) {
1252       oidx = 5*(*vi++);
1253       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1254       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1255       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1256       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1257       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1258       v  += 25;
1259     }
1260     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1261     idx += 5;
1262   }
1263   /* backward solve the L^T */
1264   for (i=n-1; i>=0; i--){
1265     v    = aa + 25*diag[i] - 25;
1266     vi   = aj + diag[i] - 1;
1267     nz   = diag[i] - ai[i];
1268     idt  = 5*i;
1269     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1270     while (nz--) {
1271       idx   = 5*(*vi--);
1272       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1273       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1274       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1275       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1276       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1277       v -= 25;
1278     }
1279   }
1280 
1281   /* copy t into x according to permutation */
1282   ii = 0;
1283   for (i=0; i<n; i++) {
1284     ir      = 5*r[i];
1285     x[ir]   = t[ii];
1286     x[ir+1] = t[ii+1];
1287     x[ir+2] = t[ii+2];
1288     x[ir+3] = t[ii+3];
1289     x[ir+4] = t[ii+4];
1290     ii += 5;
1291   }
1292 
1293   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1294   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1295   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1296   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1297   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1298   PetscFunctionReturn(0);
1299 }
1300 
1301 #undef __FUNCT__
1302 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1303 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1304 {
1305   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1306   IS             iscol=a->col,isrow=a->row;
1307   PetscErrorCode ierr;
1308   const PetscInt *r,*c,*rout,*cout;
1309   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1310   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1311   MatScalar      *aa=a->a,*v;
1312   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1313   PetscScalar    *x,*b,*t;
1314 
1315   PetscFunctionBegin;
1316   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1317   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1318   t  = a->solve_work;
1319 
1320   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1321   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1322 
1323   /* copy the b into temp work space according to permutation */
1324   ii = 0;
1325   for (i=0; i<n; i++) {
1326     ic      = 6*c[i];
1327     t[ii]   = b[ic];
1328     t[ii+1] = b[ic+1];
1329     t[ii+2] = b[ic+2];
1330     t[ii+3] = b[ic+3];
1331     t[ii+4] = b[ic+4];
1332     t[ii+5] = b[ic+5];
1333     ii += 6;
1334   }
1335 
1336   /* forward solve the U^T */
1337   idx = 0;
1338   for (i=0; i<n; i++) {
1339 
1340     v     = aa + 36*diag[i];
1341     /* multiply by the inverse of the block diagonal */
1342     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1343     x6    = t[5+idx];
1344     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1345     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1346     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1347     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1348     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1349     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1350     v += 36;
1351 
1352     vi    = aj + diag[i] + 1;
1353     nz    = ai[i+1] - diag[i] - 1;
1354     while (nz--) {
1355       oidx = 6*(*vi++);
1356       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1357       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1358       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1359       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1360       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1361       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1362       v  += 36;
1363     }
1364     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1365     t[5+idx] = s6;
1366     idx += 6;
1367   }
1368   /* backward solve the L^T */
1369   for (i=n-1; i>=0; i--){
1370     v    = aa + 36*diag[i] - 36;
1371     vi   = aj + diag[i] - 1;
1372     nz   = diag[i] - ai[i];
1373     idt  = 6*i;
1374     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1375     s6 = t[5+idt];
1376     while (nz--) {
1377       idx   = 6*(*vi--);
1378       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1379       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1380       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1381       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1382       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1383       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1384       v -= 36;
1385     }
1386   }
1387 
1388   /* copy t into x according to permutation */
1389   ii = 0;
1390   for (i=0; i<n; i++) {
1391     ir      = 6*r[i];
1392     x[ir]   = t[ii];
1393     x[ir+1] = t[ii+1];
1394     x[ir+2] = t[ii+2];
1395     x[ir+3] = t[ii+3];
1396     x[ir+4] = t[ii+4];
1397     x[ir+5] = t[ii+5];
1398     ii += 6;
1399   }
1400 
1401   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1402   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1403   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1404   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1405   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 #undef __FUNCT__
1410 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1411 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1412 {
1413   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1414   IS             iscol=a->col,isrow=a->row;
1415   PetscErrorCode ierr;
1416   const PetscInt *r,*c,*rout,*cout;
1417   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1418   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1419   MatScalar      *aa=a->a,*v;
1420   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1421   PetscScalar    *x,*b,*t;
1422 
1423   PetscFunctionBegin;
1424   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1425   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1426   t  = a->solve_work;
1427 
1428   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1429   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1430 
1431   /* copy the b into temp work space according to permutation */
1432   ii = 0;
1433   for (i=0; i<n; i++) {
1434     ic      = 7*c[i];
1435     t[ii]   = b[ic];
1436     t[ii+1] = b[ic+1];
1437     t[ii+2] = b[ic+2];
1438     t[ii+3] = b[ic+3];
1439     t[ii+4] = b[ic+4];
1440     t[ii+5] = b[ic+5];
1441     t[ii+6] = b[ic+6];
1442     ii += 7;
1443   }
1444 
1445   /* forward solve the U^T */
1446   idx = 0;
1447   for (i=0; i<n; i++) {
1448 
1449     v     = aa + 49*diag[i];
1450     /* multiply by the inverse of the block diagonal */
1451     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1452     x6    = t[5+idx]; x7 = t[6+idx];
1453     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1454     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1455     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1456     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1457     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1458     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1459     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1460     v += 49;
1461 
1462     vi    = aj + diag[i] + 1;
1463     nz    = ai[i+1] - diag[i] - 1;
1464     while (nz--) {
1465       oidx = 7*(*vi++);
1466       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1467       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1468       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1469       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1470       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1471       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1472       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1473       v  += 49;
1474     }
1475     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1476     t[5+idx] = s6;t[6+idx] = s7;
1477     idx += 7;
1478   }
1479   /* backward solve the L^T */
1480   for (i=n-1; i>=0; i--){
1481     v    = aa + 49*diag[i] - 49;
1482     vi   = aj + diag[i] - 1;
1483     nz   = diag[i] - ai[i];
1484     idt  = 7*i;
1485     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1486     s6 = t[5+idt];s7 = t[6+idt];
1487     while (nz--) {
1488       idx   = 7*(*vi--);
1489       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1490       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1491       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1492       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1493       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1494       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1495       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1496       v -= 49;
1497     }
1498   }
1499 
1500   /* copy t into x according to permutation */
1501   ii = 0;
1502   for (i=0; i<n; i++) {
1503     ir      = 7*r[i];
1504     x[ir]   = t[ii];
1505     x[ir+1] = t[ii+1];
1506     x[ir+2] = t[ii+2];
1507     x[ir+3] = t[ii+3];
1508     x[ir+4] = t[ii+4];
1509     x[ir+5] = t[ii+5];
1510     x[ir+6] = t[ii+6];
1511     ii += 7;
1512   }
1513 
1514   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1515   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1516   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1517   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1518   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1519   PetscFunctionReturn(0);
1520 }
1521 
1522 /* ----------------------------------------------------------- */
1523 #undef __FUNCT__
1524 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1525 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1526 {
1527   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1528   IS             iscol=a->col,isrow=a->row;
1529   PetscErrorCode ierr;
1530   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1531   PetscInt       i,n=a->mbs;
1532   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1533   MatScalar      *aa=a->a,*v;
1534   PetscScalar    *x,*b,*s,*t,*ls;
1535 
1536   PetscFunctionBegin;
1537   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1538   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1539   t  = a->solve_work;
1540 
1541   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1542   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1543 
1544   /* forward solve the lower triangular */
1545   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1546   for (i=1; i<n; i++) {
1547     v   = aa + bs2*ai[i];
1548     vi  = aj + ai[i];
1549     nz  = a->diag[i] - ai[i];
1550     s = t + bs*i;
1551     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1552     while (nz--) {
1553       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1554       v += bs2;
1555     }
1556   }
1557   /* backward solve the upper triangular */
1558   ls = a->solve_work + A->cmap->n;
1559   for (i=n-1; i>=0; i--){
1560     v   = aa + bs2*(a->diag[i] + 1);
1561     vi  = aj + a->diag[i] + 1;
1562     nz  = ai[i+1] - a->diag[i] - 1;
1563     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1564     while (nz--) {
1565       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1566       v += bs2;
1567     }
1568     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1569     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1570   }
1571 
1572   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1573   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1574   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1575   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1576   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 /* ----------------------------------------------------------- */
1581 #undef __FUNCT__
1582 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
1583 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1584 {
1585   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1586   IS                iscol=a->col,isrow=a->row;
1587   PetscErrorCode    ierr;
1588   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1589   PetscInt          i,n=a->mbs,j;
1590   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
1591   const MatScalar   *aa=a->a,*v;
1592   PetscScalar       *x,*t,*ls;
1593   const PetscScalar *b;
1594   PetscFunctionBegin;
1595   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1596   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1597   t    = a->solve_work;
1598 
1599   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1600   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1601 
1602   /* copy the b into temp work space according to permutation */
1603   for (i=0; i<n; i++) {
1604     for (j=0; j<bs; j++) {
1605       t[i*bs+j] = b[c[i]*bs+j];
1606     }
1607   }
1608 
1609 
1610   /* forward solve the upper triangular transpose */
1611   ls = a->solve_work + A->cmap->n;
1612   for (i=0; i<n; i++){
1613     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1614     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1615     v   = aa + bs2*(a->diag[i] + 1);
1616     vi  = aj + a->diag[i] + 1;
1617     nz  = ai[i+1] - a->diag[i] - 1;
1618     while (nz--) {
1619       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1620       v += bs2;
1621     }
1622   }
1623 
1624   /* backward solve the lower triangular transpose */
1625   for (i=n-1; i>=0; i--) {
1626     v   = aa + bs2*ai[i];
1627     vi  = aj + ai[i];
1628     nz  = a->diag[i] - ai[i];
1629     while (nz--) {
1630       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
1631       v += bs2;
1632     }
1633   }
1634 
1635   /* copy t into x according to permutation */
1636   for (i=0; i<n; i++) {
1637     for (j=0; j<bs; j++) {
1638       x[bs*r[i]+j]   = t[bs*i+j];
1639     }
1640   }
1641 
1642   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1643   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1644   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1645   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1646   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1647   PetscFunctionReturn(0);
1648 }
1649 
1650 #undef __FUNCT__
1651 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_newdatastruct"
1652 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx)
1653 {
1654   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1655   IS                iscol=a->col,isrow=a->row;
1656   PetscErrorCode    ierr;
1657   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
1658   PetscInt          i,n=a->mbs,j;
1659   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
1660   const MatScalar   *aa=a->a,*v;
1661   PetscScalar       *x,*t,*ls;
1662   const PetscScalar *b;
1663   PetscFunctionBegin;
1664   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1665   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1666   t    = a->solve_work;
1667 
1668   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1669   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1670 
1671   /* copy the b into temp work space according to permutation */
1672   for (i=0; i<n; i++) {
1673     for (j=0; j<bs; j++) {
1674       t[i*bs+j] = b[c[i]*bs+j];
1675     }
1676   }
1677 
1678 
1679   /* forward solve the upper triangular transpose */
1680   ls = a->solve_work + A->cmap->n;
1681   for (i=0; i<n; i++){
1682     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1683     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
1684     v   = aa + bs2*(diag[i] - 1);
1685     vi  = aj + diag[i] - 1;
1686     nz  = diag[i] - diag[i+1] - 1;
1687     for(j=0;j>-nz;j--){
1688       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1689       v -= bs2;
1690     }
1691   }
1692 
1693   /* backward solve the lower triangular transpose */
1694   for (i=n-1; i>=0; i--) {
1695     v   = aa + bs2*ai[i];
1696     vi  = aj + ai[i];
1697     nz  = ai[i+1] - ai[i];
1698     for(j=0;j<nz;j++){
1699       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1700       v += bs2;
1701     }
1702   }
1703 
1704   /* copy t into x according to permutation */
1705   for (i=0; i<n; i++) {
1706     for (j=0; j<bs; j++) {
1707       x[bs*r[i]+j]   = t[bs*i+j];
1708     }
1709   }
1710 
1711   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1712   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1713   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1714   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1715   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1716   PetscFunctionReturn(0);
1717 }
1718 
1719 #undef __FUNCT__
1720 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
/*
   MatSolve_SeqBAIJ_7 - forward and backward triangular solves with a
   factored SeqBAIJ matrix, hand-unrolled for 7x7 blocks (49 scalars per
   block, 7 scalars per block row of the vectors).

   A  - factored matrix; a->i, a->j, a->a, a->diag, a->row, a->col and
        a->solve_work are used
   bb - right-hand side
   xx - receives the solution

   The forward sweep reads the blocks of bb in the order given by the row
   index set and stores its result in the work array t.  The backward sweep
   updates t in place and writes each finished block into xx at the position
   given by the column index set.

   In every unrolled expression block entry (p,q) is read as v[p+7*q].
*/
1721 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1722 {
1723   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1724   IS             iscol=a->col,isrow=a->row;
1725   PetscErrorCode ierr;
1726   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1727   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1728   MatScalar      *aa=a->a,*v;
1729   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1730   PetscScalar    *x,*b,*t;
1731 
1732   PetscFunctionBegin;
1733   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1734   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1735   t  = a->solve_work;
1736 
       /* r is consumed front to back by the forward sweep; c starts at the
          last column index because the backward sweep consumes it with *c-- */
1737   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1738   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1739 
1740   /* forward solve the lower triangular */
       /* block row 0 has nothing to subtract: it is copied straight from b */
1741   idx    = 7*(*r++);
1742   t[0] = b[idx];   t[1] = b[1+idx];
1743   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1744   t[5] = b[5+idx]; t[6] = b[6+idx];
1745 
1746   for (i=1; i<n; i++) {
         /* entries ai[i] .. diag[i]-1 of row i are the ones used here */
1747     v     = aa + 49*ai[i];
1748     vi    = aj + ai[i];
1749     nz    = diag[i] - ai[i];
1750     idx   = 7*(*r++);
1751     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1752     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1753     while (nz--) {
           /* s -= (block at v) * (already computed block of t at column *vi) */
1754       idx   = 7*(*vi++);
1755       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1756       x4    = t[3+idx];x5 = t[4+idx];
1757       x6    = t[5+idx];x7 = t[6+idx];
1758       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1759       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1760       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1761       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1762       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1763       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1764       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1765       v += 49;
1766     }
1767     idx = 7*i;
1768     t[idx]   = s1;t[1+idx] = s2;
1769     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1770     t[5+idx] = s6;t[6+idx] = s7;
1771   }
1772   /* backward solve the upper triangular */
1773   for (i=n-1; i>=0; i--){
         /* entries diag[i]+1 .. ai[i+1]-1 of row i are the ones used here */
1774     v    = aa + 49*diag[i] + 49;
1775     vi   = aj + diag[i] + 1;
1776     nz   = ai[i+1] - diag[i] - 1;
1777     idt  = 7*i;
1778     s1 = t[idt];  s2 = t[1+idt];
1779     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1780     s6 = t[5+idt];s7 = t[6+idt];
1781     while (nz--) {
1782       idx   = 7*(*vi++);
1783       x1    = t[idx];   x2 = t[1+idx];
1784       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1785       x6    = t[5+idx]; x7 = t[6+idx];
1786       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1787       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1788       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1789       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1790       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1791       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1792       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1793       v += 49;
1794     }
         /* the block stored at diag[i] is multiplied into s (no division is
            done here; presumably the factorization stores this block already
            inverted - confirm against the factor routine).  The result goes
            both to t, for use by earlier rows, and to x at the next column
            index. */
1795     idc = 7*(*c--);
1796     v   = aa + 49*diag[i];
1797     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1798                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1799     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1800                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1801     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1802                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1803     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1804                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1805     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1806                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1807     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1808                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1809     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1810                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1811   }
1812 
1813   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1814   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1815   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1816   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop count: 2*49 per stored block minus 7 per column of A */
1817   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1818   PetscFunctionReturn(0);
1819 }
1820 
1821 #undef __FUNCT__
1822 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
/*
   MatSolve_SeqBAIJ_7_newdatastruct - forward and backward triangular solves
   with a factored SeqBAIJ matrix, hand-unrolled for 7x7 blocks, for the
   factor layout in which row i of the lower part occupies entries
   ai[i] .. ai[i+1]-1 and row i of the upper part occupies entries
   adiag[i+1]+1 .. adiag[i], the last of which is used as the diagonal block.

   A  - factored matrix; a->i, a->j, a->a, a->diag, a->row, a->col and
        a->solve_work are used
   bb - right-hand side
   xx - receives the solution

   The right-hand side is read through the row index set (r[i]) and the
   solution is written through the column index set (c[i]).  Block entry
   (p,q) is read as v[p+7*q].
*/
1823 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1824 {
1825   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1826   IS             iscol=a->col,isrow=a->row;
1827   PetscErrorCode ierr;
1828   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
1829   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
1830   MatScalar      *aa=a->a,*v;
1831   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1832   PetscScalar    *x,*b,*t;
1833 
1834   PetscFunctionBegin;
1835   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1836   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1837   t  = a->solve_work;
1838 
1839   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1840   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1841 
1842   /* forward solve the lower triangular */
       /* block row 0 has nothing to subtract: it is copied straight from b */
1843   idx    = 7*r[0];
1844   t[0] = b[idx];   t[1] = b[1+idx];
1845   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1846   t[5] = b[5+idx]; t[6] = b[6+idx];
1847 
1848   for (i=1; i<n; i++) {
1849     v     = aa + 49*ai[i];
1850     vi    = aj + ai[i];
1851     nz    = ai[i+1] - ai[i];
1852     idx   = 7*r[i];
1853     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1854     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1855     for(m=0;m<nz;m++){
           /* s -= (block at v) * (already computed block of t at column vi[m]) */
1856       idx   = 7*vi[m];
1857       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1858       x4    = t[3+idx];x5 = t[4+idx];
1859       x6    = t[5+idx];x7 = t[6+idx];
1860       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1861       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1862       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1863       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1864       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1865       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1866       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1867       v += 49;
1868     }
1869     idx = 7*i;
1870     t[idx]   = s1;t[1+idx] = s2;
1871     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1872     t[5+idx] = s6;t[6+idx] = s7;
1873   }
1874   /* backward solve the upper triangular */
1875   for (i=n-1; i>=0; i--){
1876     v    = aa + 49*(adiag[i+1]+1);
1877     vi   = aj + adiag[i+1]+1;
1878     nz   = adiag[i] - adiag[i+1] - 1;
1879     idt  = 7*i;
1880     s1 = t[idt];  s2 = t[1+idt];
1881     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1882     s6 = t[5+idt];s7 = t[6+idt];
1883     for(m=0;m<nz;m++){
1884       idx   = 7*vi[m];
1885       x1    = t[idx];   x2 = t[1+idx];
1886       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1887       x6    = t[5+idx]; x7 = t[6+idx];
1888       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1889       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1890       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1891       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1892       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1893       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1894       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1895       v += 49;
1896     }
         /* v is not reset here: after nz steps of 49 from adiag[i+1]+1 it
            points at entry adiag[i], the block multiplied into s below
            (presumably the pre-inverted diagonal block - confirm against the
            factor routine) */
1897     idc = 7*c[i];
1898     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1899                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1900     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1901                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1902     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1903                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1904     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1905                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1906     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1907                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1908     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1909                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1910     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1911                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1912   }
1913 
1914   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1915   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1916   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1917   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop count: 2*49 per stored block minus 7 per column of A */
1918   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1919   PetscFunctionReturn(0);
1920 }
1921 
1922 #undef __FUNCT__
1923 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1924 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1925 {
1926   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1927   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1928   PetscErrorCode    ierr;
1929   PetscInt          *diag = a->diag,jdx;
1930   const MatScalar   *aa=a->a,*v;
1931   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1932   const PetscScalar *b;
1933 
1934   PetscFunctionBegin;
1935   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1936   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1937   /* forward solve the lower triangular */
1938   idx    = 0;
1939   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1940   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1941   x[6] = b[6+idx];
1942   for (i=1; i<n; i++) {
1943     v     =  aa + 49*ai[i];
1944     vi    =  aj + ai[i];
1945     nz    =  diag[i] - ai[i];
1946     idx   =  7*i;
1947     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1948     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1949     s7  =  b[6+idx];
1950     while (nz--) {
1951       jdx   = 7*(*vi++);
1952       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1953       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1954       x7    = x[6+jdx];
1955       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1956       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1957       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1958       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1959       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1960       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1961       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1962       v += 49;
1963      }
1964     x[idx]   = s1;
1965     x[1+idx] = s2;
1966     x[2+idx] = s3;
1967     x[3+idx] = s4;
1968     x[4+idx] = s5;
1969     x[5+idx] = s6;
1970     x[6+idx] = s7;
1971   }
1972   /* backward solve the upper triangular */
1973   for (i=n-1; i>=0; i--){
1974     v    = aa + 49*diag[i] + 49;
1975     vi   = aj + diag[i] + 1;
1976     nz   = ai[i+1] - diag[i] - 1;
1977     idt  = 7*i;
1978     s1 = x[idt];   s2 = x[1+idt];
1979     s3 = x[2+idt]; s4 = x[3+idt];
1980     s5 = x[4+idt]; s6 = x[5+idt];
1981     s7 = x[6+idt];
1982     while (nz--) {
1983       idx   = 7*(*vi++);
1984       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1985       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1986       x7    = x[6+idx];
1987       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1988       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1989       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1990       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1991       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1992       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1993       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1994       v += 49;
1995     }
1996     v        = aa + 49*diag[i];
1997     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1998                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1999     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2000                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2001     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2002                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2003     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2004                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2005     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2006                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2007     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2008                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2009     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2010                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2011   }
2012 
2013   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2014   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2015   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2016   PetscFunctionReturn(0);
2017 }
2018 
2019 #undef __FUNCT__
2020 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct - forward and backward
   triangular solves with a factored SeqBAIJ matrix, hand-unrolled for 7x7
   blocks, without row/column index sets, for the factor layout in which row
   i of the lower part occupies entries ai[i] .. ai[i+1]-1 and row i of the
   upper part occupies entries adiag[i+1]+1 .. adiag[i].

   A  - factored matrix; a->i, a->j, a->a and a->diag are used
   bb - right-hand side (only read)
   xx - receives the solution; both sweeps work in place in xx

   The unrolled arithmetic hard-codes 7 components per block row, while the
   strides come from bs = A->rmap->bs and bs2 = a->bs2; the routine is only
   meaningful when those are 7 and 49.  Block entry (p,q) is read as v[p+7*q].
*/
2021 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2022 {
2023     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2024     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2025     PetscErrorCode    ierr;
2026     PetscInt          idx,jdx,idt;
2027     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2028     const MatScalar   *aa=a->a,*v;
2029     PetscScalar       *x;
2030     const PetscScalar *b;
2031     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2032 
2033     PetscFunctionBegin;
2034     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2035     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2036     /* forward solve the lower triangular */
         /* block row 0 has nothing to subtract: it is copied straight from b */
2037     idx    = 0;
2038     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2039     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2040     for (i=1; i<n; i++) {
2041        v    = aa + bs2*ai[i];
2042        vi   = aj + ai[i];
2043        nz   = ai[i+1] - ai[i];
2044       idx   = bs*i;
2045        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2046        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2047        for(k=0;k<nz;k++) {
              /* s -= (block at v) * (already computed block of x at column vi[k]) */
2048           jdx   = bs*vi[k];
2049           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2050 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2051           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2052           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2053           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2054 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2055           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2056 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2057 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2058           v   +=  bs2;
2059         }
2060 
2061        x[idx]   = s1;
2062        x[1+idx] = s2;
2063        x[2+idx] = s3;
2064        x[3+idx] = s4;
2065        x[4+idx] = s5;
2066        x[5+idx] = s6;
2067        x[6+idx] = s7;
2068     }
2069 
2070    /* backward solve the upper triangular */
2071   for (i=n-1; i>=0; i--){
2072     v   = aa + bs2*(adiag[i+1]+1);
2073      vi  = aj + adiag[i+1]+1;
2074      nz  = adiag[i] - adiag[i+1]-1;
2075      idt = bs*i;
2076      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2077      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2078     for(k=0;k<nz;k++) {
2079       idx   = bs*vi[k];
2080        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2081        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2082        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2083        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2084        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2085        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2086        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2087        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2088        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2089         v   +=  bs2;
2090     }
         /* v is not reset: after nz steps of bs2 from adiag[i+1]+1 it points
            at entry adiag[i], which is the block applied below */
2091     /* x = inv_diagonal*x */
2092     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2093     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2094     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2095     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2096     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2097     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2098     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2099   }
2100 
2101   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2102   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop count: 2*bs2 per stored block minus bs per column of A */
2103   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2104   PetscFunctionReturn(0);
2105 }
2106 
2107 #undef __FUNCT__
2108 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
/*
   MatSolve_SeqBAIJ_6 - forward and backward triangular solves with a
   factored SeqBAIJ matrix, hand-unrolled for 6x6 blocks (36 scalars per
   block, 6 scalars per block row of the vectors).

   A  - factored matrix; a->i, a->j, a->a, a->diag, a->row, a->col and
        a->solve_work are used
   bb - right-hand side (only read)
   xx - receives the solution

   The forward sweep reads the blocks of bb in the order given by the row
   index set and stores its result in the work array t.  The backward sweep
   updates t in place and writes each finished block into xx at the position
   given by the column index set.  Block entry (p,q) is read as v[p+6*q].
*/
2109 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2110 {
2111   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2112   IS                iscol=a->col,isrow=a->row;
2113   PetscErrorCode    ierr;
2114   const PetscInt    *r,*c,*rout,*cout;
2115   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2116   const MatScalar   *aa=a->a,*v;
2117   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2118   const PetscScalar *b;
2119   PetscFunctionBegin;
2120   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2121   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2122   t  = a->solve_work;
2123 
       /* r is consumed front to back by the forward sweep; c starts at the
          last column index because the backward sweep consumes it with *c-- */
2124   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2125   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2126 
2127   /* forward solve the lower triangular */
       /* block row 0 has nothing to subtract: it is copied straight from b */
2128   idx    = 6*(*r++);
2129   t[0] = b[idx];   t[1] = b[1+idx];
2130   t[2] = b[2+idx]; t[3] = b[3+idx];
2131   t[4] = b[4+idx]; t[5] = b[5+idx];
2132   for (i=1; i<n; i++) {
         /* entries ai[i] .. diag[i]-1 of row i are the ones used here */
2133     v     = aa + 36*ai[i];
2134     vi    = aj + ai[i];
2135     nz    = diag[i] - ai[i];
2136     idx   = 6*(*r++);
2137     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2138     s5  = b[4+idx]; s6 = b[5+idx];
2139     while (nz--) {
           /* s -= (block at v) * (already computed block of t at column *vi) */
2140       idx   = 6*(*vi++);
2141       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2142       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2143       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2144       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2145       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2146       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2147       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2148       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2149       v += 36;
2150     }
2151     idx = 6*i;
2152     t[idx]   = s1;t[1+idx] = s2;
2153     t[2+idx] = s3;t[3+idx] = s4;
2154     t[4+idx] = s5;t[5+idx] = s6;
2155   }
2156   /* backward solve the upper triangular */
2157   for (i=n-1; i>=0; i--){
         /* entries diag[i]+1 .. ai[i+1]-1 of row i are the ones used here */
2158     v    = aa + 36*diag[i] + 36;
2159     vi   = aj + diag[i] + 1;
2160     nz   = ai[i+1] - diag[i] - 1;
2161     idt  = 6*i;
2162     s1 = t[idt];  s2 = t[1+idt];
2163     s3 = t[2+idt];s4 = t[3+idt];
2164     s5 = t[4+idt];s6 = t[5+idt];
2165     while (nz--) {
2166       idx   = 6*(*vi++);
2167       x1    = t[idx];   x2 = t[1+idx];
2168       x3    = t[2+idx]; x4 = t[3+idx];
2169       x5    = t[4+idx]; x6 = t[5+idx];
2170       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2171       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2172       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2173       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2174       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2175       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2176       v += 36;
2177     }
         /* the block stored at diag[i] is multiplied into s (presumably the
            pre-inverted diagonal block - confirm against the factor routine);
            the result goes both to t, for use by earlier rows, and to x at
            the next column index */
2178     idc = 6*(*c--);
2179     v   = aa + 36*diag[i];
2180     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2181                                  v[18]*s4+v[24]*s5+v[30]*s6;
2182     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2183                                  v[19]*s4+v[25]*s5+v[31]*s6;
2184     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2185                                  v[20]*s4+v[26]*s5+v[32]*s6;
2186     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2187                                  v[21]*s4+v[27]*s5+v[33]*s6;
2188     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2189                                  v[22]*s4+v[28]*s5+v[34]*s6;
2190     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2191                                  v[23]*s4+v[29]*s5+v[35]*s6;
2192   }
2193 
2194   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2195   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2196   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2197   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop count: 2*36 per stored block minus 6 per column of A */
2198   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2199   PetscFunctionReturn(0);
2200 }
2201 
2202 #undef __FUNCT__
2203 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
/*
   MatSolve_SeqBAIJ_6_newdatastruct - forward and backward triangular solves
   with a factored SeqBAIJ matrix, hand-unrolled for 6x6 blocks, for the
   factor layout in which row i of the lower part occupies entries
   ai[i] .. ai[i+1]-1 and row i of the upper part occupies entries
   adiag[i+1]+1 .. adiag[i], the last of which is used as the diagonal block.

   A  - factored matrix; a->i, a->j, a->a, a->diag, a->row, a->col and
        a->solve_work are used
   bb - right-hand side (only read)
   xx - receives the solution

   The right-hand side is read through the row index set (r[i]) and the
   solution is written through the column index set (c[i]).  Block entry
   (p,q) is read as v[p+6*q].
*/
2204 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
2205 {
2206   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2207   IS                iscol=a->col,isrow=a->row;
2208   PetscErrorCode    ierr;
2209   const PetscInt    *r,*c,*rout,*cout;
2210   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2211   const MatScalar   *aa=a->a,*v;
2212   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2213   const PetscScalar *b;
2214   PetscFunctionBegin;
2215   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2216   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2217   t  = a->solve_work;
2218 
2219   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2220   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2221 
2222   /* forward solve the lower triangular */
       /* block row 0 has nothing to subtract: it is copied straight from b */
2223   idx    = 6*r[0];
2224   t[0] = b[idx];   t[1] = b[1+idx];
2225   t[2] = b[2+idx]; t[3] = b[3+idx];
2226   t[4] = b[4+idx]; t[5] = b[5+idx];
2227   for (i=1; i<n; i++) {
2228     v     = aa + 36*ai[i];
2229     vi    = aj + ai[i];
2230     nz    = ai[i+1] - ai[i];
2231     idx   = 6*r[i];
2232     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2233     s5  = b[4+idx]; s6 = b[5+idx];
2234     for(m=0;m<nz;m++){
           /* s -= (block at v) * (already computed block of t at column vi[m]) */
2235       idx   = 6*vi[m];
2236       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2237       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2238       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2239       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2240       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2241       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2242       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2243       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2244       v += 36;
2245     }
2246     idx = 6*i;
2247     t[idx]   = s1;t[1+idx] = s2;
2248     t[2+idx] = s3;t[3+idx] = s4;
2249     t[4+idx] = s5;t[5+idx] = s6;
2250   }
2251   /* backward solve the upper triangular */
2252   for (i=n-1; i>=0; i--){
2253     v    = aa + 36*(adiag[i+1]+1);
2254     vi   = aj + adiag[i+1]+1;
2255     nz   = adiag[i] - adiag[i+1] - 1;
2256     idt  = 6*i;
2257     s1 = t[idt];  s2 = t[1+idt];
2258     s3 = t[2+idt];s4 = t[3+idt];
2259     s5 = t[4+idt];s6 = t[5+idt];
2260     for(m=0;m<nz;m++){
2261       idx   = 6*vi[m];
2262       x1    = t[idx];   x2 = t[1+idx];
2263       x3    = t[2+idx]; x4 = t[3+idx];
2264       x5    = t[4+idx]; x6 = t[5+idx];
2265       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2266       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2267       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2268       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2269       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2270       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2271       v += 36;
2272     }
         /* v is not reset here: after nz steps of 36 from adiag[i+1]+1 it
            points at entry adiag[i], the block multiplied into s below
            (presumably the pre-inverted diagonal block - confirm against the
            factor routine) */
2273     idc = 6*c[i];
2274     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2275                                  v[18]*s4+v[24]*s5+v[30]*s6;
2276     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2277                                  v[19]*s4+v[25]*s5+v[31]*s6;
2278     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2279                                  v[20]*s4+v[26]*s5+v[32]*s6;
2280     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2281                                  v[21]*s4+v[27]*s5+v[33]*s6;
2282     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2283                                  v[22]*s4+v[28]*s5+v[34]*s6;
2284     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2285                                  v[23]*s4+v[29]*s5+v[35]*s6;
2286   }
2287 
2288   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2289   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2290   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2291   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
       /* logged flop count: 2*36 per stored block minus 6 per column of A */
2292   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2293   PetscFunctionReturn(0);
2294 }
2295 
2296 #undef __FUNCT__
2297 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
2298 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
2299 {
2300   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2301   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2302   PetscErrorCode    ierr;
2303   PetscInt          *diag = a->diag,jdx;
2304   const MatScalar   *aa=a->a,*v;
2305   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2306   const PetscScalar *b;
2307 
2308   PetscFunctionBegin;
2309   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2310   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2311   /* forward solve the lower triangular */
2312   idx    = 0;
2313   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2314   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2315   for (i=1; i<n; i++) {
2316     v     =  aa + 36*ai[i];
2317     vi    =  aj + ai[i];
2318     nz    =  diag[i] - ai[i];
2319     idx   =  6*i;
2320     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2321     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2322     while (nz--) {
2323       jdx   = 6*(*vi++);
2324       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2325       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2326       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2327       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2328       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2329       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2330       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2331       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2332       v += 36;
2333      }
2334     x[idx]   = s1;
2335     x[1+idx] = s2;
2336     x[2+idx] = s3;
2337     x[3+idx] = s4;
2338     x[4+idx] = s5;
2339     x[5+idx] = s6;
2340   }
2341   /* backward solve the upper triangular */
2342   for (i=n-1; i>=0; i--){
2343     v    = aa + 36*diag[i] + 36;
2344     vi   = aj + diag[i] + 1;
2345     nz   = ai[i+1] - diag[i] - 1;
2346     idt  = 6*i;
2347     s1 = x[idt];   s2 = x[1+idt];
2348     s3 = x[2+idt]; s4 = x[3+idt];
2349     s5 = x[4+idt]; s6 = x[5+idt];
2350     while (nz--) {
2351       idx   = 6*(*vi++);
2352       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2353       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2354       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2355       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2356       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2357       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2358       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2359       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2360       v += 36;
2361     }
2362     v        = aa + 36*diag[i];
2363     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2364     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2365     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2366     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2367     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2368     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2369   }
2370 
2371   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2372   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2373   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2374   PetscFunctionReturn(0);
2375 }
2376 
2377 #undef __FUNCT__
2378 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2379 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2380 {
2381     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2382     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2383     PetscErrorCode    ierr;
2384     PetscInt          idx,jdx,idt;
2385     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2386     const MatScalar   *aa=a->a,*v;
2387     PetscScalar       *x;
2388     const PetscScalar *b;
2389     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2390 
2391     PetscFunctionBegin;
2392     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2393     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2394     /* forward solve the lower triangular */
2395     idx    = 0;
2396     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2397     x[4] = b[4+idx];x[5] = b[5+idx];
2398     for (i=1; i<n; i++) {
2399        v    = aa + bs2*ai[i];
2400        vi   = aj + ai[i];
2401        nz   = ai[i+1] - ai[i];
2402       idx   = bs*i;
2403        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2404        s5   = b[4+idx];s6 = b[5+idx];
2405        for(k=0;k<nz;k++){
2406           jdx   = bs*vi[k];
2407           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2408 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2409           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2410           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2411           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2412 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2413           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2414 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2415           v   +=  bs2;
2416         }
2417 
2418        x[idx]   = s1;
2419        x[1+idx] = s2;
2420        x[2+idx] = s3;
2421        x[3+idx] = s4;
2422        x[4+idx] = s5;
2423        x[5+idx] = s6;
2424     }
2425 
2426    /* backward solve the upper triangular */
2427   for (i=n-1; i>=0; i--){
2428     v   = aa + bs2*(adiag[i+1]+1);
2429      vi  = aj + adiag[i+1]+1;
2430      nz  = adiag[i] - adiag[i+1]-1;
2431      idt = bs*i;
2432      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2433      s5 = x[4+idt];s6 = x[5+idt];
2434      for(k=0;k<nz;k++){
2435       idx   = bs*vi[k];
2436        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2437        x5    = x[4+idx];x6 = x[5+idx];
2438        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2439        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2440        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2441        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2442        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2443        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2444         v   +=  bs2;
2445     }
2446     /* x = inv_diagonal*x */
2447    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2448    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2449    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2450    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2451    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2452    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2453   }
2454 
2455   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2457   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2458   PetscFunctionReturn(0);
2459 }
2460 
2461 #undef __FUNCT__
2462 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2463 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
2464 {
2465   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2466   IS                iscol=a->col,isrow=a->row;
2467   PetscErrorCode    ierr;
2468   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
2469   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2470   const MatScalar   *aa=a->a,*v;
2471   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2472   const PetscScalar *b;
2473 
2474   PetscFunctionBegin;
2475   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2476   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2477   t  = a->solve_work;
2478 
2479   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2480   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2481 
2482   /* forward solve the lower triangular */
2483   idx    = 5*(*r++);
2484   t[0] = b[idx];   t[1] = b[1+idx];
2485   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2486   for (i=1; i<n; i++) {
2487     v     = aa + 25*ai[i];
2488     vi    = aj + ai[i];
2489     nz    = diag[i] - ai[i];
2490     idx   = 5*(*r++);
2491     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2492     s5  = b[4+idx];
2493     while (nz--) {
2494       idx   = 5*(*vi++);
2495       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2496       x4    = t[3+idx];x5 = t[4+idx];
2497       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2498       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2499       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2500       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2501       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2502       v += 25;
2503     }
2504     idx = 5*i;
2505     t[idx]   = s1;t[1+idx] = s2;
2506     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2507   }
2508   /* backward solve the upper triangular */
2509   for (i=n-1; i>=0; i--){
2510     v    = aa + 25*diag[i] + 25;
2511     vi   = aj + diag[i] + 1;
2512     nz   = ai[i+1] - diag[i] - 1;
2513     idt  = 5*i;
2514     s1 = t[idt];  s2 = t[1+idt];
2515     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2516     while (nz--) {
2517       idx   = 5*(*vi++);
2518       x1    = t[idx];   x2 = t[1+idx];
2519       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2520       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2521       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2522       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2523       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2524       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2525       v += 25;
2526     }
2527     idc = 5*(*c--);
2528     v   = aa + 25*diag[i];
2529     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2530                                  v[15]*s4+v[20]*s5;
2531     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2532                                  v[16]*s4+v[21]*s5;
2533     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2534                                  v[17]*s4+v[22]*s5;
2535     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2536                                  v[18]*s4+v[23]*s5;
2537     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2538                                  v[19]*s4+v[24]*s5;
2539   }
2540 
2541   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2542   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2543   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2544   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2545   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2546   PetscFunctionReturn(0);
2547 }
2548 
2549 #undef __FUNCT__
2550 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2551 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
2552 {
2553   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2554   IS                iscol=a->col,isrow=a->row;
2555   PetscErrorCode    ierr;
2556   const PetscInt    *r,*c,*rout,*cout;
2557   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2558   const MatScalar   *aa=a->a,*v;
2559   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2560   const PetscScalar *b;
2561 
2562   PetscFunctionBegin;
2563   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2564   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2565   t  = a->solve_work;
2566 
2567   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2568   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2569 
2570   /* forward solve the lower triangular */
2571   idx    = 5*r[0];
2572   t[0] = b[idx];   t[1] = b[1+idx];
2573   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2574   for (i=1; i<n; i++) {
2575     v     = aa + 25*ai[i];
2576     vi    = aj + ai[i];
2577     nz    = ai[i+1] - ai[i];
2578     idx   = 5*r[i];
2579     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2580     s5  = b[4+idx];
2581     for(m=0;m<nz;m++){
2582       idx   = 5*vi[m];
2583       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2584       x4    = t[3+idx];x5 = t[4+idx];
2585       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2586       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2587       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2588       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2589       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2590       v += 25;
2591     }
2592     idx = 5*i;
2593     t[idx]   = s1;t[1+idx] = s2;
2594     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2595   }
2596   /* backward solve the upper triangular */
2597   for (i=n-1; i>=0; i--){
2598     v    = aa + 25*(adiag[i+1]+1);
2599     vi   = aj + adiag[i+1]+1;
2600     nz   = adiag[i] - adiag[i+1] - 1;
2601     idt  = 5*i;
2602     s1 = t[idt];  s2 = t[1+idt];
2603     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2604     for(m=0;m<nz;m++){
2605       idx   = 5*vi[m];
2606       x1    = t[idx];   x2 = t[1+idx];
2607       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2608       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2609       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2610       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2611       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2612       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2613       v += 25;
2614     }
2615     idc = 5*c[i];
2616     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2617                                  v[15]*s4+v[20]*s5;
2618     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2619                                  v[16]*s4+v[21]*s5;
2620     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2621                                  v[17]*s4+v[22]*s5;
2622     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2623                                  v[18]*s4+v[23]*s5;
2624     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2625                                  v[19]*s4+v[24]*s5;
2626   }
2627 
2628   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2629   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2630   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2631   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2632   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2633   PetscFunctionReturn(0);
2634 }
2635 
2636 #undef __FUNCT__
2637 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2638 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
2639 {
2640   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2641   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2642   PetscErrorCode    ierr;
2643   PetscInt          *diag = a->diag,jdx;
2644   const MatScalar   *aa=a->a,*v;
2645   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2646   const PetscScalar *b;
2647 
2648   PetscFunctionBegin;
2649   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2650   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2651   /* forward solve the lower triangular */
2652   idx    = 0;
2653   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2654   for (i=1; i<n; i++) {
2655     v     =  aa + 25*ai[i];
2656     vi    =  aj + ai[i];
2657     nz    =  diag[i] - ai[i];
2658     idx   =  5*i;
2659     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2660     while (nz--) {
2661       jdx   = 5*(*vi++);
2662       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2663       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2664       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2665       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2666       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2667       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2668       v    += 25;
2669     }
2670     x[idx]   = s1;
2671     x[1+idx] = s2;
2672     x[2+idx] = s3;
2673     x[3+idx] = s4;
2674     x[4+idx] = s5;
2675   }
2676   /* backward solve the upper triangular */
2677   for (i=n-1; i>=0; i--){
2678     v    = aa + 25*diag[i] + 25;
2679     vi   = aj + diag[i] + 1;
2680     nz   = ai[i+1] - diag[i] - 1;
2681     idt  = 5*i;
2682     s1 = x[idt];  s2 = x[1+idt];
2683     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2684     while (nz--) {
2685       idx   = 5*(*vi++);
2686       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2687       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2688       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2689       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2690       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2691       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2692       v    += 25;
2693     }
2694     v        = aa + 25*diag[i];
2695     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2696     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2697     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2698     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2699     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2700   }
2701 
2702   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2703   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2704   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2705   PetscFunctionReturn(0);
2706 }
2707 
2708 #undef __FUNCT__
2709 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2710 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2711 {
2712   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2713   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
2714   PetscErrorCode    ierr;
2715   PetscInt          jdx;
2716   const MatScalar   *aa=a->a,*v;
2717   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2718   const PetscScalar *b;
2719 
2720   PetscFunctionBegin;
2721   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2722   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2723   /* forward solve the lower triangular */
2724   idx    = 0;
2725   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2726   for (i=1; i<n; i++) {
2727     v   = aa + 25*ai[i];
2728     vi  = aj + ai[i];
2729     nz  = ai[i+1] - ai[i];
2730     idx = 5*i;
2731     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
2732     for(k=0;k<nz;k++) {
2733       jdx   = 5*vi[k];
2734       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2735       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2736       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2737       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2738       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2739       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2740       v    += 25;
2741     }
2742     x[idx]   = s1;
2743     x[1+idx] = s2;
2744     x[2+idx] = s3;
2745     x[3+idx] = s4;
2746     x[4+idx] = s5;
2747   }
2748 
2749   /* backward solve the upper triangular */
2750   for (i=n-1; i>=0; i--){
2751     v   = aa + 25*(adiag[i+1]+1);
2752     vi  = aj + adiag[i+1]+1;
2753     nz  = adiag[i] - adiag[i+1]-1;
2754     idt = 5*i;
2755     s1 = x[idt];  s2 = x[1+idt];
2756     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
2757     for(k=0;k<nz;k++){
2758       idx   = 5*vi[k];
2759       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2760       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2761       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2762       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2763       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2764       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2765       v    += 25;
2766     }
2767     /* x = inv_diagonal*x */
2768     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2769     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2770     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2771     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2772     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2773   }
2774 
2775   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2776   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2777   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2778   PetscFunctionReturn(0);
2779 }
2780 
2781 #undef __FUNCT__
2782 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2783 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2784 {
2785   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2786   IS                iscol=a->col,isrow=a->row;
2787   PetscErrorCode    ierr;
2788   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2789   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2790   const MatScalar   *aa=a->a,*v;
2791   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2792   const PetscScalar *b;
2793 
2794   PetscFunctionBegin;
2795   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2796   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2797   t  = a->solve_work;
2798 
2799   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2800   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2801 
2802   /* forward solve the lower triangular */
2803   idx    = 4*(*r++);
2804   t[0] = b[idx];   t[1] = b[1+idx];
2805   t[2] = b[2+idx]; t[3] = b[3+idx];
2806   for (i=1; i<n; i++) {
2807     v     = aa + 16*ai[i];
2808     vi    = aj + ai[i];
2809     nz    = diag[i] - ai[i];
2810     idx   = 4*(*r++);
2811     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2812     while (nz--) {
2813       idx   = 4*(*vi++);
2814       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2815       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2816       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2817       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2818       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2819       v    += 16;
2820     }
2821     idx        = 4*i;
2822     t[idx]   = s1;t[1+idx] = s2;
2823     t[2+idx] = s3;t[3+idx] = s4;
2824   }
2825   /* backward solve the upper triangular */
2826   for (i=n-1; i>=0; i--){
2827     v    = aa + 16*diag[i] + 16;
2828     vi   = aj + diag[i] + 1;
2829     nz   = ai[i+1] - diag[i] - 1;
2830     idt  = 4*i;
2831     s1 = t[idt];  s2 = t[1+idt];
2832     s3 = t[2+idt];s4 = t[3+idt];
2833     while (nz--) {
2834       idx   = 4*(*vi++);
2835       x1    = t[idx];   x2 = t[1+idx];
2836       x3    = t[2+idx]; x4 = t[3+idx];
2837       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2838       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2839       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2840       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2841       v += 16;
2842     }
2843     idc      = 4*(*c--);
2844     v        = aa + 16*diag[i];
2845     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2846     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2847     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2848     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2849   }
2850 
2851   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2852   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2853   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2854   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2855   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2856   PetscFunctionReturn(0);
2857 }
2858 
2859 #undef __FUNCT__
2860 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2861 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
2862 {
2863   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2864   IS                iscol=a->col,isrow=a->row;
2865   PetscErrorCode    ierr;
2866   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2867   const PetscInt    *r,*c,*rout,*cout;
2868   const MatScalar   *aa=a->a,*v;
2869   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2870   const PetscScalar *b;
2871 
2872   PetscFunctionBegin;
2873   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2874   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2875   t  = a->solve_work;
2876 
2877   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2878   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2879 
2880   /* forward solve the lower triangular */
2881   idx    = 4*r[0];
2882   t[0] = b[idx];   t[1] = b[1+idx];
2883   t[2] = b[2+idx]; t[3] = b[3+idx];
2884   for (i=1; i<n; i++) {
2885     v     = aa + 16*ai[i];
2886     vi    = aj + ai[i];
2887     nz    = ai[i+1] - ai[i];
2888     idx   = 4*r[i];
2889     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2890     for(m=0;m<nz;m++){
2891       idx   = 4*vi[m];
2892       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2893       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2894       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2895       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2896       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2897       v    += 16;
2898     }
2899     idx        = 4*i;
2900     t[idx]   = s1;t[1+idx] = s2;
2901     t[2+idx] = s3;t[3+idx] = s4;
2902   }
2903   /* backward solve the upper triangular */
2904   for (i=n-1; i>=0; i--){
2905     v    = aa + 16*(adiag[i+1]+1);
2906     vi   = aj + adiag[i+1]+1;
2907     nz   = adiag[i] - adiag[i+1] - 1;
2908     idt  = 4*i;
2909     s1 = t[idt];  s2 = t[1+idt];
2910     s3 = t[2+idt];s4 = t[3+idt];
2911     for(m=0;m<nz;m++){
2912       idx   = 4*vi[m];
2913       x1    = t[idx];   x2 = t[1+idx];
2914       x3    = t[2+idx]; x4 = t[3+idx];
2915       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2916       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2917       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2918       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2919       v += 16;
2920     }
2921     idc      = 4*c[i];
2922     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2923     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2924     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2925     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2926   }
2927 
2928   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2929   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2930   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2931   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2932   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2933   PetscFunctionReturn(0);
2934 }
2935 
2936 #undef __FUNCT__
2937 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2938 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2939 {
2940   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2941   IS                iscol=a->col,isrow=a->row;
2942   PetscErrorCode    ierr;
2943   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2944   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2945   const MatScalar   *aa=a->a,*v;
2946   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2947   PetscScalar       *x;
2948   const PetscScalar *b;
2949 
2950   PetscFunctionBegin;
2951   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2952   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2953   t  = (MatScalar *)a->solve_work;
2954 
2955   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2956   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2957 
2958   /* forward solve the lower triangular */
2959   idx    = 4*(*r++);
2960   t[0] = (MatScalar)b[idx];
2961   t[1] = (MatScalar)b[1+idx];
2962   t[2] = (MatScalar)b[2+idx];
2963   t[3] = (MatScalar)b[3+idx];
2964   for (i=1; i<n; i++) {
2965     v     = aa + 16*ai[i];
2966     vi    = aj + ai[i];
2967     nz    = diag[i] - ai[i];
2968     idx   = 4*(*r++);
2969     s1 = (MatScalar)b[idx];
2970     s2 = (MatScalar)b[1+idx];
2971     s3 = (MatScalar)b[2+idx];
2972     s4 = (MatScalar)b[3+idx];
2973     while (nz--) {
2974       idx   = 4*(*vi++);
2975       x1  = t[idx];
2976       x2  = t[1+idx];
2977       x3  = t[2+idx];
2978       x4  = t[3+idx];
2979       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2980       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2981       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2982       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2983       v    += 16;
2984     }
2985     idx        = 4*i;
2986     t[idx]   = s1;
2987     t[1+idx] = s2;
2988     t[2+idx] = s3;
2989     t[3+idx] = s4;
2990   }
2991   /* backward solve the upper triangular */
2992   for (i=n-1; i>=0; i--){
2993     v    = aa + 16*diag[i] + 16;
2994     vi   = aj + diag[i] + 1;
2995     nz   = ai[i+1] - diag[i] - 1;
2996     idt  = 4*i;
2997     s1 = t[idt];
2998     s2 = t[1+idt];
2999     s3 = t[2+idt];
3000     s4 = t[3+idt];
3001     while (nz--) {
3002       idx   = 4*(*vi++);
3003       x1  = t[idx];
3004       x2  = t[1+idx];
3005       x3  = t[2+idx];
3006       x4  = t[3+idx];
3007       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3008       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3009       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3010       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3011       v += 16;
3012     }
3013     idc      = 4*(*c--);
3014     v        = aa + 16*diag[i];
3015     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3016     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3017     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3018     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3019     x[idc]   = (PetscScalar)t[idt];
3020     x[1+idc] = (PetscScalar)t[1+idt];
3021     x[2+idc] = (PetscScalar)t[2+idt];
3022     x[3+idc] = (PetscScalar)t[3+idt];
3023  }
3024 
3025   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3026   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3027   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3028   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3029   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3030   PetscFunctionReturn(0);
3031 }
3032 
3033 #if defined (PETSC_HAVE_SSE)
3034 
3035 #include PETSC_HAVE_SSE
3036 
3037 #undef __FUNCT__
3038 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3039 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3040 {
3041   /*
3042      Note: This code uses demotion of double
3043      to float when performing the mixed-mode computation.
3044      This may not be numerically reasonable for all applications.
3045   */
3046   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3047   IS             iscol=a->col,isrow=a->row;
3048   PetscErrorCode ierr;
3049   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3050   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3051   MatScalar      *aa=a->a,*v;
3052   PetscScalar    *x,*b,*t;
3053 
3054   /* Make space in temp stack for 16 Byte Aligned arrays */
3055   float           ssealignedspace[11],*tmps,*tmpx;
3056   unsigned long   offset;
3057 
3058   PetscFunctionBegin;
3059   SSE_SCOPE_BEGIN;
3060 
3061     offset = (unsigned long)ssealignedspace % 16;
3062     if (offset) offset = (16 - offset)/4;
3063     tmps = &ssealignedspace[offset];
3064     tmpx = &ssealignedspace[offset+4];
3065     PREFETCH_NTA(aa+16*ai[1]);
3066 
3067     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3068     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3069     t  = a->solve_work;
3070 
3071     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3072     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3073 
3074     /* forward solve the lower triangular */
3075     idx  = 4*(*r++);
3076     t[0] = b[idx];   t[1] = b[1+idx];
3077     t[2] = b[2+idx]; t[3] = b[3+idx];
3078     v    =  aa + 16*ai[1];
3079 
3080     for (i=1; i<n;) {
3081       PREFETCH_NTA(&v[8]);
3082       vi   =  aj      + ai[i];
3083       nz   =  diag[i] - ai[i];
3084       idx  =  4*(*r++);
3085 
3086       /* Demote sum from double to float */
3087       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3088       LOAD_PS(tmps,XMM7);
3089 
3090       while (nz--) {
3091         PREFETCH_NTA(&v[16]);
3092         idx = 4*(*vi++);
3093 
3094         /* Demote solution (so far) from double to float */
3095         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3096 
3097         /* 4x4 Matrix-Vector product with negative accumulation: */
3098         SSE_INLINE_BEGIN_2(tmpx,v)
3099           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3100 
3101           /* First Column */
3102           SSE_COPY_PS(XMM0,XMM6)
3103           SSE_SHUFFLE(XMM0,XMM0,0x00)
3104           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3105           SSE_SUB_PS(XMM7,XMM0)
3106 
3107           /* Second Column */
3108           SSE_COPY_PS(XMM1,XMM6)
3109           SSE_SHUFFLE(XMM1,XMM1,0x55)
3110           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3111           SSE_SUB_PS(XMM7,XMM1)
3112 
3113           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3114 
3115           /* Third Column */
3116           SSE_COPY_PS(XMM2,XMM6)
3117           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3118           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3119           SSE_SUB_PS(XMM7,XMM2)
3120 
3121           /* Fourth Column */
3122           SSE_COPY_PS(XMM3,XMM6)
3123           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3124           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3125           SSE_SUB_PS(XMM7,XMM3)
3126         SSE_INLINE_END_2
3127 
3128         v  += 16;
3129       }
3130       idx = 4*i;
3131       v   = aa + 16*ai[++i];
3132       PREFETCH_NTA(v);
3133       STORE_PS(tmps,XMM7);
3134 
3135       /* Promote result from float to double */
3136       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3137     }
3138     /* backward solve the upper triangular */
3139     idt  = 4*(n-1);
3140     ai16 = 16*diag[n-1];
3141     v    = aa + ai16 + 16;
3142     for (i=n-1; i>=0;){
3143       PREFETCH_NTA(&v[8]);
3144       vi = aj + diag[i] + 1;
3145       nz = ai[i+1] - diag[i] - 1;
3146 
3147       /* Demote accumulator from double to float */
3148       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3149       LOAD_PS(tmps,XMM7);
3150 
3151       while (nz--) {
3152         PREFETCH_NTA(&v[16]);
3153         idx = 4*(*vi++);
3154 
3155         /* Demote solution (so far) from double to float */
3156         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3157 
3158         /* 4x4 Matrix-Vector Product with negative accumulation: */
3159         SSE_INLINE_BEGIN_2(tmpx,v)
3160           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3161 
3162           /* First Column */
3163           SSE_COPY_PS(XMM0,XMM6)
3164           SSE_SHUFFLE(XMM0,XMM0,0x00)
3165           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3166           SSE_SUB_PS(XMM7,XMM0)
3167 
3168           /* Second Column */
3169           SSE_COPY_PS(XMM1,XMM6)
3170           SSE_SHUFFLE(XMM1,XMM1,0x55)
3171           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3172           SSE_SUB_PS(XMM7,XMM1)
3173 
3174           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3175 
3176           /* Third Column */
3177           SSE_COPY_PS(XMM2,XMM6)
3178           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3179           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3180           SSE_SUB_PS(XMM7,XMM2)
3181 
3182           /* Fourth Column */
3183           SSE_COPY_PS(XMM3,XMM6)
3184           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3185           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3186           SSE_SUB_PS(XMM7,XMM3)
3187         SSE_INLINE_END_2
3188         v  += 16;
3189       }
3190       v    = aa + ai16;
3191       ai16 = 16*diag[--i];
3192       PREFETCH_NTA(aa+ai16+16);
3193       /*
3194          Scale the result by the diagonal 4x4 block,
3195          which was inverted as part of the factorization
3196       */
3197       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3198         /* First Column */
3199         SSE_COPY_PS(XMM0,XMM7)
3200         SSE_SHUFFLE(XMM0,XMM0,0x00)
3201         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3202 
3203         /* Second Column */
3204         SSE_COPY_PS(XMM1,XMM7)
3205         SSE_SHUFFLE(XMM1,XMM1,0x55)
3206         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3207         SSE_ADD_PS(XMM0,XMM1)
3208 
3209         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3210 
3211         /* Third Column */
3212         SSE_COPY_PS(XMM2,XMM7)
3213         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3214         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3215         SSE_ADD_PS(XMM0,XMM2)
3216 
3217         /* Fourth Column */
3218         SSE_COPY_PS(XMM3,XMM7)
3219         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3220         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3221         SSE_ADD_PS(XMM0,XMM3)
3222 
3223         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3224       SSE_INLINE_END_3
3225 
3226       /* Promote solution from float to double */
3227       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3228 
3229       /* Apply reordering to t and stream into x.    */
3230       /* This way, x doesn't pollute the cache.      */
3231       /* Be careful with size: 2 doubles = 4 floats! */
3232       idc  = 4*(*c--);
3233       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3234         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3235         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3236         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3237         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3238         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3239         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3240       SSE_INLINE_END_2
3241       v    = aa + ai16 + 16;
3242       idt -= 4;
3243     }
3244 
3245     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3246     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3247     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3248     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3249     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3250   SSE_SCOPE_END;
3251   PetscFunctionReturn(0);
3252 }
3253 
3254 #endif
3255 
3256 
3257 /*
3258       Special case where the matrix was ILU(0) factored in the natural
3259    ordering. This eliminates the need for the column and row permutation.
3260 */
3261 #undef __FUNCT__
3262 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3263 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3264 {
3265   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3266   PetscInt          n=a->mbs;
3267   const PetscInt    *ai=a->i,*aj=a->j;
3268   PetscErrorCode    ierr;
3269   const PetscInt    *diag = a->diag;
3270   const MatScalar   *aa=a->a;
3271   PetscScalar       *x;
3272   const PetscScalar *b;
3273 
3274   PetscFunctionBegin;
3275   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3276   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3277 
3278 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3279   {
3280     static PetscScalar w[2000]; /* very BAD need to fix */
3281     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3282   }
3283 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3284   {
3285     static PetscScalar w[2000]; /* very BAD need to fix */
3286     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3287   }
3288 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3289   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3290 #else
3291   {
3292     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3293     const MatScalar *v;
3294     PetscInt        jdx,idt,idx,nz,i,ai16;
3295     const PetscInt  *vi;
3296 
3297   /* forward solve the lower triangular */
3298   idx    = 0;
3299   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3300   for (i=1; i<n; i++) {
3301     v     =  aa      + 16*ai[i];
3302     vi    =  aj      + ai[i];
3303     nz    =  diag[i] - ai[i];
3304     idx   +=  4;
3305     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3306     while (nz--) {
3307       jdx   = 4*(*vi++);
3308       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3309       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3310       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3311       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3312       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3313       v    += 16;
3314     }
3315     x[idx]   = s1;
3316     x[1+idx] = s2;
3317     x[2+idx] = s3;
3318     x[3+idx] = s4;
3319   }
3320   /* backward solve the upper triangular */
3321   idt = 4*(n-1);
3322   for (i=n-1; i>=0; i--){
3323     ai16 = 16*diag[i];
3324     v    = aa + ai16 + 16;
3325     vi   = aj + diag[i] + 1;
3326     nz   = ai[i+1] - diag[i] - 1;
3327     s1 = x[idt];  s2 = x[1+idt];
3328     s3 = x[2+idt];s4 = x[3+idt];
3329     while (nz--) {
3330       idx   = 4*(*vi++);
3331       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3332       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3333       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3334       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3335       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3336       v    += 16;
3337     }
3338     v        = aa + ai16;
3339     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3340     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3341     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3342     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3343     idt -= 4;
3344   }
3345   }
3346 #endif
3347 
3348   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3349   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3350   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3351   PetscFunctionReturn(0);
3352 }
3353 
3354 #undef __FUNCT__
3355 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3356 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3357 {
3358     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3359     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3360     PetscErrorCode    ierr;
3361     PetscInt          idx,jdx,idt;
3362     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3363     const MatScalar   *aa=a->a,*v;
3364     PetscScalar       *x;
3365     const PetscScalar *b;
3366     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3367 
3368     PetscFunctionBegin;
3369     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3370     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3371     /* forward solve the lower triangular */
3372     idx    = 0;
3373     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3374     for (i=1; i<n; i++) {
3375        v    = aa + bs2*ai[i];
3376        vi   = aj + ai[i];
3377        nz   = ai[i+1] - ai[i];
3378       idx   = bs*i;
3379        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3380       for(k=0;k<nz;k++) {
3381           jdx   = bs*vi[k];
3382           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3383           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3384           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3385           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3386 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3387 
3388           v   +=  bs2;
3389         }
3390 
3391        x[idx]   = s1;
3392        x[1+idx] = s2;
3393        x[2+idx] = s3;
3394        x[3+idx] = s4;
3395     }
3396 
3397    /* backward solve the upper triangular */
3398   for (i=n-1; i>=0; i--){
3399     v   = aa + bs2*(adiag[i+1]+1);
3400      vi  = aj + adiag[i+1]+1;
3401      nz  = adiag[i] - adiag[i+1]-1;
3402      idt = bs*i;
3403      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3404 
3405     for(k=0;k<nz;k++){
3406       idx   = bs*vi[k];
3407        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3408        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3409        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3410        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3411        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3412 
3413         v   +=  bs2;
3414     }
3415     /* x = inv_diagonal*x */
3416    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3417    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3418    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3419    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3420 
3421   }
3422 
3423   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3424   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3425   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3426   PetscFunctionReturn(0);
3427 }
3428 
3429 #undef __FUNCT__
3430 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3431 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3432 {
3433   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3434   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3435   PetscErrorCode ierr;
3436   PetscInt       *diag = a->diag;
3437   MatScalar      *aa=a->a;
3438   PetscScalar    *x,*b;
3439 
3440   PetscFunctionBegin;
3441   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3442   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3443 
3444   {
3445     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3446     MatScalar  *v,*t=(MatScalar *)x;
3447     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3448 
3449     /* forward solve the lower triangular */
3450     idx  = 0;
3451     t[0] = (MatScalar)b[0];
3452     t[1] = (MatScalar)b[1];
3453     t[2] = (MatScalar)b[2];
3454     t[3] = (MatScalar)b[3];
3455     for (i=1; i<n; i++) {
3456       v     =  aa      + 16*ai[i];
3457       vi    =  aj      + ai[i];
3458       nz    =  diag[i] - ai[i];
3459       idx   +=  4;
3460       s1 = (MatScalar)b[idx];
3461       s2 = (MatScalar)b[1+idx];
3462       s3 = (MatScalar)b[2+idx];
3463       s4 = (MatScalar)b[3+idx];
3464       while (nz--) {
3465         jdx = 4*(*vi++);
3466         x1  = t[jdx];
3467         x2  = t[1+jdx];
3468         x3  = t[2+jdx];
3469         x4  = t[3+jdx];
3470         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3471         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3472         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3473         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3474         v    += 16;
3475       }
3476       t[idx]   = s1;
3477       t[1+idx] = s2;
3478       t[2+idx] = s3;
3479       t[3+idx] = s4;
3480     }
3481     /* backward solve the upper triangular */
3482     idt = 4*(n-1);
3483     for (i=n-1; i>=0; i--){
3484       ai16 = 16*diag[i];
3485       v    = aa + ai16 + 16;
3486       vi   = aj + diag[i] + 1;
3487       nz   = ai[i+1] - diag[i] - 1;
3488       s1   = t[idt];
3489       s2   = t[1+idt];
3490       s3   = t[2+idt];
3491       s4   = t[3+idt];
3492       while (nz--) {
3493         idx = 4*(*vi++);
3494         x1  = (MatScalar)x[idx];
3495         x2  = (MatScalar)x[1+idx];
3496         x3  = (MatScalar)x[2+idx];
3497         x4  = (MatScalar)x[3+idx];
3498         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3499         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3500         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3501         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3502         v    += 16;
3503       }
3504       v        = aa + ai16;
3505       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3506       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3507       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3508       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3509       idt -= 4;
3510     }
3511   }
3512 
3513   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3514   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3515   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3516   PetscFunctionReturn(0);
3517 }
3518 
3519 #if defined (PETSC_HAVE_SSE)
3520 
3521 #include PETSC_HAVE_SSE
3522 #undef __FUNCT__
3523 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3524 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
3525 {
3526   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3527   unsigned short *aj=(unsigned short *)a->j;
3528   PetscErrorCode ierr;
3529   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3530   MatScalar      *aa=a->a;
3531   PetscScalar    *x,*b;
3532 
3533   PetscFunctionBegin;
3534   SSE_SCOPE_BEGIN;
3535   /*
3536      Note: This code currently uses demotion of double
3537      to float when performing the mixed-mode computation.
3538      This may not be numerically reasonable for all applications.
3539   */
3540   PREFETCH_NTA(aa+16*ai[1]);
3541 
3542   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3543   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3544   {
3545     /* x will first be computed in single precision then promoted inplace to double */
3546     MatScalar      *v,*t=(MatScalar *)x;
3547     int            nz,i,idt,ai16;
3548     unsigned int   jdx,idx;
3549     unsigned short *vi;
3550     /* Forward solve the lower triangular factor. */
3551 
3552     /* First block is the identity. */
3553     idx  = 0;
3554     CONVERT_DOUBLE4_FLOAT4(t,b);
3555     v    =  aa + 16*((unsigned int)ai[1]);
3556 
3557     for (i=1; i<n;) {
3558       PREFETCH_NTA(&v[8]);
3559       vi   =  aj      + ai[i];
3560       nz   =  diag[i] - ai[i];
3561       idx +=  4;
3562 
3563       /* Demote RHS from double to float. */
3564       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3565       LOAD_PS(&t[idx],XMM7);
3566 
3567       while (nz--) {
3568         PREFETCH_NTA(&v[16]);
3569         jdx = 4*((unsigned int)(*vi++));
3570 
3571         /* 4x4 Matrix-Vector product with negative accumulation: */
3572         SSE_INLINE_BEGIN_2(&t[jdx],v)
3573           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3574 
3575           /* First Column */
3576           SSE_COPY_PS(XMM0,XMM6)
3577           SSE_SHUFFLE(XMM0,XMM0,0x00)
3578           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3579           SSE_SUB_PS(XMM7,XMM0)
3580 
3581           /* Second Column */
3582           SSE_COPY_PS(XMM1,XMM6)
3583           SSE_SHUFFLE(XMM1,XMM1,0x55)
3584           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3585           SSE_SUB_PS(XMM7,XMM1)
3586 
3587           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3588 
3589           /* Third Column */
3590           SSE_COPY_PS(XMM2,XMM6)
3591           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3592           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3593           SSE_SUB_PS(XMM7,XMM2)
3594 
3595           /* Fourth Column */
3596           SSE_COPY_PS(XMM3,XMM6)
3597           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3598           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3599           SSE_SUB_PS(XMM7,XMM3)
3600         SSE_INLINE_END_2
3601 
3602         v  += 16;
3603       }
3604       v    =  aa + 16*ai[++i];
3605       PREFETCH_NTA(v);
3606       STORE_PS(&t[idx],XMM7);
3607     }
3608 
3609     /* Backward solve the upper triangular factor.*/
3610 
3611     idt  = 4*(n-1);
3612     ai16 = 16*diag[n-1];
3613     v    = aa + ai16 + 16;
3614     for (i=n-1; i>=0;){
3615       PREFETCH_NTA(&v[8]);
3616       vi = aj + diag[i] + 1;
3617       nz = ai[i+1] - diag[i] - 1;
3618 
3619       LOAD_PS(&t[idt],XMM7);
3620 
3621       while (nz--) {
3622         PREFETCH_NTA(&v[16]);
3623         idx = 4*((unsigned int)(*vi++));
3624 
3625         /* 4x4 Matrix-Vector Product with negative accumulation: */
3626         SSE_INLINE_BEGIN_2(&t[idx],v)
3627           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3628 
3629           /* First Column */
3630           SSE_COPY_PS(XMM0,XMM6)
3631           SSE_SHUFFLE(XMM0,XMM0,0x00)
3632           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3633           SSE_SUB_PS(XMM7,XMM0)
3634 
3635           /* Second Column */
3636           SSE_COPY_PS(XMM1,XMM6)
3637           SSE_SHUFFLE(XMM1,XMM1,0x55)
3638           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3639           SSE_SUB_PS(XMM7,XMM1)
3640 
3641           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3642 
3643           /* Third Column */
3644           SSE_COPY_PS(XMM2,XMM6)
3645           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3646           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3647           SSE_SUB_PS(XMM7,XMM2)
3648 
3649           /* Fourth Column */
3650           SSE_COPY_PS(XMM3,XMM6)
3651           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3652           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3653           SSE_SUB_PS(XMM7,XMM3)
3654         SSE_INLINE_END_2
3655         v  += 16;
3656       }
3657       v    = aa + ai16;
3658       ai16 = 16*diag[--i];
3659       PREFETCH_NTA(aa+ai16+16);
3660       /*
3661          Scale the result by the diagonal 4x4 block,
3662          which was inverted as part of the factorization
3663       */
3664       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3665         /* First Column */
3666         SSE_COPY_PS(XMM0,XMM7)
3667         SSE_SHUFFLE(XMM0,XMM0,0x00)
3668         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3669 
3670         /* Second Column */
3671         SSE_COPY_PS(XMM1,XMM7)
3672         SSE_SHUFFLE(XMM1,XMM1,0x55)
3673         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3674         SSE_ADD_PS(XMM0,XMM1)
3675 
3676         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3677 
3678         /* Third Column */
3679         SSE_COPY_PS(XMM2,XMM7)
3680         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3681         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3682         SSE_ADD_PS(XMM0,XMM2)
3683 
3684         /* Fourth Column */
3685         SSE_COPY_PS(XMM3,XMM7)
3686         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3687         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3688         SSE_ADD_PS(XMM0,XMM3)
3689 
3690         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3691       SSE_INLINE_END_3
3692 
3693       v    = aa + ai16 + 16;
3694       idt -= 4;
3695     }
3696 
3697     /* Convert t from single precision back to double precision (inplace)*/
3698     idt = 4*(n-1);
3699     for (i=n-1;i>=0;i--) {
3700       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3701       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3702       PetscScalar *xtemp=&x[idt];
3703       MatScalar   *ttemp=&t[idt];
3704       xtemp[3] = (PetscScalar)ttemp[3];
3705       xtemp[2] = (PetscScalar)ttemp[2];
3706       xtemp[1] = (PetscScalar)ttemp[1];
3707       xtemp[0] = (PetscScalar)ttemp[0];
3708       idt -= 4;
3709     }
3710 
3711   } /* End of artificial scope. */
3712   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3713   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3714   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3715   SSE_SCOPE_END;
3716   PetscFunctionReturn(0);
3717 }
3718 
3719 #undef __FUNCT__
3720 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3721 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
3722 {
3723   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3724   int            *aj=a->j;
3725   PetscErrorCode ierr;
3726   int            *ai=a->i,n=a->mbs,*diag = a->diag;
3727   MatScalar      *aa=a->a;
3728   PetscScalar    *x,*b;
3729 
3730   PetscFunctionBegin;
3731   SSE_SCOPE_BEGIN;
3732   /*
3733      Note: This code currently uses demotion of double
3734      to float when performing the mixed-mode computation.
3735      This may not be numerically reasonable for all applications.
3736   */
3737   PREFETCH_NTA(aa+16*ai[1]);
3738 
3739   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3740   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3741   {
3742     /* x will first be computed in single precision then promoted inplace to double */
3743     MatScalar *v,*t=(MatScalar *)x;
3744     int       nz,i,idt,ai16;
3745     int       jdx,idx;
3746     int       *vi;
3747     /* Forward solve the lower triangular factor. */
3748 
3749     /* First block is the identity. */
3750     idx  = 0;
3751     CONVERT_DOUBLE4_FLOAT4(t,b);
3752     v    =  aa + 16*ai[1];
3753 
3754     for (i=1; i<n;) {
3755       PREFETCH_NTA(&v[8]);
3756       vi   =  aj      + ai[i];
3757       nz   =  diag[i] - ai[i];
3758       idx +=  4;
3759 
3760       /* Demote RHS from double to float. */
3761       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3762       LOAD_PS(&t[idx],XMM7);
3763 
3764       while (nz--) {
3765         PREFETCH_NTA(&v[16]);
3766         jdx = 4*(*vi++);
3767 /*          jdx = *vi++; */
3768 
3769         /* 4x4 Matrix-Vector product with negative accumulation: */
3770         SSE_INLINE_BEGIN_2(&t[jdx],v)
3771           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3772 
3773           /* First Column */
3774           SSE_COPY_PS(XMM0,XMM6)
3775           SSE_SHUFFLE(XMM0,XMM0,0x00)
3776           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3777           SSE_SUB_PS(XMM7,XMM0)
3778 
3779           /* Second Column */
3780           SSE_COPY_PS(XMM1,XMM6)
3781           SSE_SHUFFLE(XMM1,XMM1,0x55)
3782           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3783           SSE_SUB_PS(XMM7,XMM1)
3784 
3785           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3786 
3787           /* Third Column */
3788           SSE_COPY_PS(XMM2,XMM6)
3789           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3790           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3791           SSE_SUB_PS(XMM7,XMM2)
3792 
3793           /* Fourth Column */
3794           SSE_COPY_PS(XMM3,XMM6)
3795           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3796           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3797           SSE_SUB_PS(XMM7,XMM3)
3798         SSE_INLINE_END_2
3799 
3800         v  += 16;
3801       }
3802       v    =  aa + 16*ai[++i];
3803       PREFETCH_NTA(v);
3804       STORE_PS(&t[idx],XMM7);
3805     }
3806 
3807     /* Backward solve the upper triangular factor.*/
3808 
3809     idt  = 4*(n-1);
3810     ai16 = 16*diag[n-1];
3811     v    = aa + ai16 + 16;
3812     for (i=n-1; i>=0;){
3813       PREFETCH_NTA(&v[8]);
3814       vi = aj + diag[i] + 1;
3815       nz = ai[i+1] - diag[i] - 1;
3816 
3817       LOAD_PS(&t[idt],XMM7);
3818 
3819       while (nz--) {
3820         PREFETCH_NTA(&v[16]);
3821         idx = 4*(*vi++);
3822 /*          idx = *vi++; */
3823 
3824         /* 4x4 Matrix-Vector Product with negative accumulation: */
3825         SSE_INLINE_BEGIN_2(&t[idx],v)
3826           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3827 
3828           /* First Column */
3829           SSE_COPY_PS(XMM0,XMM6)
3830           SSE_SHUFFLE(XMM0,XMM0,0x00)
3831           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3832           SSE_SUB_PS(XMM7,XMM0)
3833 
3834           /* Second Column */
3835           SSE_COPY_PS(XMM1,XMM6)
3836           SSE_SHUFFLE(XMM1,XMM1,0x55)
3837           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3838           SSE_SUB_PS(XMM7,XMM1)
3839 
3840           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3841 
3842           /* Third Column */
3843           SSE_COPY_PS(XMM2,XMM6)
3844           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3845           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3846           SSE_SUB_PS(XMM7,XMM2)
3847 
3848           /* Fourth Column */
3849           SSE_COPY_PS(XMM3,XMM6)
3850           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3851           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3852           SSE_SUB_PS(XMM7,XMM3)
3853         SSE_INLINE_END_2
3854         v  += 16;
3855       }
3856       v    = aa + ai16;
3857       ai16 = 16*diag[--i];
3858       PREFETCH_NTA(aa+ai16+16);
3859       /*
3860          Scale the result by the diagonal 4x4 block,
3861          which was inverted as part of the factorization
3862       */
3863       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3864         /* First Column */
3865         SSE_COPY_PS(XMM0,XMM7)
3866         SSE_SHUFFLE(XMM0,XMM0,0x00)
3867         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3868 
3869         /* Second Column */
3870         SSE_COPY_PS(XMM1,XMM7)
3871         SSE_SHUFFLE(XMM1,XMM1,0x55)
3872         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3873         SSE_ADD_PS(XMM0,XMM1)
3874 
3875         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3876 
3877         /* Third Column */
3878         SSE_COPY_PS(XMM2,XMM7)
3879         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3880         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3881         SSE_ADD_PS(XMM0,XMM2)
3882 
3883         /* Fourth Column */
3884         SSE_COPY_PS(XMM3,XMM7)
3885         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3886         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3887         SSE_ADD_PS(XMM0,XMM3)
3888 
3889         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3890       SSE_INLINE_END_3
3891 
3892       v    = aa + ai16 + 16;
3893       idt -= 4;
3894     }
3895 
3896     /* Convert t from single precision back to double precision (inplace)*/
3897     idt = 4*(n-1);
3898     for (i=n-1;i>=0;i--) {
3899       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3900       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3901       PetscScalar *xtemp=&x[idt];
3902       MatScalar   *ttemp=&t[idt];
3903       xtemp[3] = (PetscScalar)ttemp[3];
3904       xtemp[2] = (PetscScalar)ttemp[2];
3905       xtemp[1] = (PetscScalar)ttemp[1];
3906       xtemp[0] = (PetscScalar)ttemp[0];
3907       idt -= 4;
3908     }
3909 
3910   } /* End of artificial scope. */
3911   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3912   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3913   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3914   SSE_SCOPE_END;
3915   PetscFunctionReturn(0);
3916 }
3917 
3918 #endif
3919 
3920 #undef __FUNCT__
3921 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3922 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
3923 {
3924   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3925   IS                iscol=a->col,isrow=a->row;
3926   PetscErrorCode    ierr;
3927   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3928   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3929   const MatScalar   *aa=a->a,*v;
3930   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3931   const PetscScalar *b;
3932 
3933   PetscFunctionBegin;
3934   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3935   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3936   t  = a->solve_work;
3937 
3938   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3939   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3940 
3941   /* forward solve the lower triangular */
3942   idx    = 3*(*r++);
3943   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3944   for (i=1; i<n; i++) {
3945     v     = aa + 9*ai[i];
3946     vi    = aj + ai[i];
3947     nz    = diag[i] - ai[i];
3948     idx   = 3*(*r++);
3949     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3950     while (nz--) {
3951       idx   = 3*(*vi++);
3952       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3953       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3954       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3955       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3956       v += 9;
3957     }
3958     idx = 3*i;
3959     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3960   }
3961   /* backward solve the upper triangular */
3962   for (i=n-1; i>=0; i--){
3963     v    = aa + 9*diag[i] + 9;
3964     vi   = aj + diag[i] + 1;
3965     nz   = ai[i+1] - diag[i] - 1;
3966     idt  = 3*i;
3967     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3968     while (nz--) {
3969       idx   = 3*(*vi++);
3970       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3971       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3972       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3973       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3974       v += 9;
3975     }
3976     idc = 3*(*c--);
3977     v   = aa + 9*diag[i];
3978     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3979     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3980     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3981   }
3982   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3983   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3984   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3985   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3986   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3987   PetscFunctionReturn(0);
3988 }
3989 
3990 #undef __FUNCT__
3991 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
3992 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
3993 {
3994   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3995   IS                iscol=a->col,isrow=a->row;
3996   PetscErrorCode    ierr;
3997   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3998   const PetscInt    *r,*c,*rout,*cout;
3999   const MatScalar   *aa=a->a,*v;
4000   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4001   const PetscScalar *b;
4002 
4003   PetscFunctionBegin;
4004   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4005   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4006   t  = a->solve_work;
4007 
4008   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4009   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4010 
4011   /* forward solve the lower triangular */
4012   idx    = 3*r[0];
4013   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4014   for (i=1; i<n; i++) {
4015     v     = aa + 9*ai[i];
4016     vi    = aj + ai[i];
4017     nz    = ai[i+1] - ai[i];
4018     idx   = 3*r[i];
4019     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4020     for(m=0;m<nz;m++){
4021       idx   = 3*vi[m];
4022       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4023       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4024       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4025       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4026       v += 9;
4027     }
4028     idx = 3*i;
4029     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4030   }
4031   /* backward solve the upper triangular */
4032   for (i=n-1; i>=0; i--){
4033     v    = aa + 9*(adiag[i+1]+1);
4034     vi   = aj + adiag[i+1]+1;
4035     nz   = adiag[i] - adiag[i+1] - 1;
4036     idt  = 3*i;
4037     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4038     for(m=0;m<nz;m++){
4039       idx   = 3*vi[m];
4040       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4041       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4042       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4043       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4044       v += 9;
4045     }
4046     idc = 3*c[i];
4047     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4048     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4049     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4050   }
4051   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4052   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4053   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4054   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4055   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4056   PetscFunctionReturn(0);
4057 }
4058 
4059 /*
4060       Special case where the matrix was ILU(0) factored in the natural
4061    ordering. This eliminates the need for the column and row permutation.
4062 */
4063 #undef __FUNCT__
4064 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4065 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4066 {
4067   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4068   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4069   PetscErrorCode    ierr;
4070   PetscInt          *diag = a->diag;
4071   const MatScalar   *aa=a->a,*v;
4072   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4073   const PetscScalar *b;
4074   PetscInt          jdx,idt,idx,nz,*vi,i;
4075 
4076   PetscFunctionBegin;
4077   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4078   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4079 
4080   /* forward solve the lower triangular */
4081   idx    = 0;
4082   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4083   for (i=1; i<n; i++) {
4084     v     =  aa      + 9*ai[i];
4085     vi    =  aj      + ai[i];
4086     nz    =  diag[i] - ai[i];
4087     idx   +=  3;
4088     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4089     while (nz--) {
4090       jdx   = 3*(*vi++);
4091       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4092       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4093       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4094       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4095       v    += 9;
4096     }
4097     x[idx]   = s1;
4098     x[1+idx] = s2;
4099     x[2+idx] = s3;
4100   }
4101   /* backward solve the upper triangular */
4102   for (i=n-1; i>=0; i--){
4103     v    = aa + 9*diag[i] + 9;
4104     vi   = aj + diag[i] + 1;
4105     nz   = ai[i+1] - diag[i] - 1;
4106     idt  = 3*i;
4107     s1 = x[idt];  s2 = x[1+idt];
4108     s3 = x[2+idt];
4109     while (nz--) {
4110       idx   = 3*(*vi++);
4111       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4112       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4113       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4114       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4115       v    += 9;
4116     }
4117     v        = aa +  9*diag[i];
4118     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4119     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4120     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4121   }
4122 
4123   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4124   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4125   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4126   PetscFunctionReturn(0);
4127 }
4128 
4129 #undef __FUNCT__
4130 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4131 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4132 {
4133     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4134     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4135     PetscErrorCode    ierr;
4136     PetscInt          idx,jdx,idt;
4137     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4138     const MatScalar   *aa=a->a,*v;
4139     PetscScalar       *x;
4140     const PetscScalar *b;
4141     PetscScalar        s1,s2,s3,x1,x2,x3;
4142 
4143     PetscFunctionBegin;
4144     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4145     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4146     /* forward solve the lower triangular */
4147     idx    = 0;
4148     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4149     for (i=1; i<n; i++) {
4150        v    = aa + bs2*ai[i];
4151        vi   = aj + ai[i];
4152        nz   = ai[i+1] - ai[i];
4153       idx   = bs*i;
4154        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4155       for(k=0;k<nz;k++){
4156          jdx   = bs*vi[k];
4157           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4158           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4159           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4160           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4161 
4162           v   +=  bs2;
4163         }
4164 
4165        x[idx]   = s1;
4166        x[1+idx] = s2;
4167        x[2+idx] = s3;
4168     }
4169 
4170    /* backward solve the upper triangular */
4171   for (i=n-1; i>=0; i--){
4172     v   = aa + bs2*(adiag[i+1]+1);
4173      vi  = aj + adiag[i+1]+1;
4174      nz  = adiag[i] - adiag[i+1]-1;
4175      idt = bs*i;
4176      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4177 
4178      for(k=0;k<nz;k++){
4179        idx   = bs*vi[k];
4180        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4181        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4182        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4183        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4184 
4185         v   +=  bs2;
4186     }
4187     /* x = inv_diagonal*x */
4188    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4189    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4190    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4191 
4192   }
4193 
4194   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4195   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4196   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4197   PetscFunctionReturn(0);
4198 }
4199 
4200 #undef __FUNCT__
4201 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4202 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4203 {
4204   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4205   IS                iscol=a->col,isrow=a->row;
4206   PetscErrorCode    ierr;
4207   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4208   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4209   const MatScalar   *aa=a->a,*v;
4210   PetscScalar       *x,s1,s2,x1,x2,*t;
4211   const PetscScalar *b;
4212 
4213   PetscFunctionBegin;
4214   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4215   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4216   t  = a->solve_work;
4217 
4218   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4219   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4220 
4221   /* forward solve the lower triangular */
4222   idx    = 2*(*r++);
4223   t[0] = b[idx]; t[1] = b[1+idx];
4224   for (i=1; i<n; i++) {
4225     v     = aa + 4*ai[i];
4226     vi    = aj + ai[i];
4227     nz    = diag[i] - ai[i];
4228     idx   = 2*(*r++);
4229     s1  = b[idx]; s2 = b[1+idx];
4230     while (nz--) {
4231       idx   = 2*(*vi++);
4232       x1    = t[idx]; x2 = t[1+idx];
4233       s1 -= v[0]*x1 + v[2]*x2;
4234       s2 -= v[1]*x1 + v[3]*x2;
4235       v += 4;
4236     }
4237     idx = 2*i;
4238     t[idx] = s1; t[1+idx] = s2;
4239   }
4240   /* backward solve the upper triangular */
4241   for (i=n-1; i>=0; i--){
4242     v    = aa + 4*diag[i] + 4;
4243     vi   = aj + diag[i] + 1;
4244     nz   = ai[i+1] - diag[i] - 1;
4245     idt  = 2*i;
4246     s1 = t[idt]; s2 = t[1+idt];
4247     while (nz--) {
4248       idx   = 2*(*vi++);
4249       x1    = t[idx]; x2 = t[1+idx];
4250       s1 -= v[0]*x1 + v[2]*x2;
4251       s2 -= v[1]*x1 + v[3]*x2;
4252       v += 4;
4253     }
4254     idc = 2*(*c--);
4255     v   = aa + 4*diag[i];
4256     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4257     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4258   }
4259   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4260   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4261   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4262   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4263   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4264   PetscFunctionReturn(0);
4265 }
4266 
4267 #undef __FUNCT__
4268 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4269 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
4270 {
4271   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4272   IS                iscol=a->col,isrow=a->row;
4273   PetscErrorCode    ierr;
4274   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4275   const PetscInt    *r,*c,*rout,*cout;
4276   const MatScalar   *aa=a->a,*v;
4277   PetscScalar       *x,s1,s2,x1,x2,*t;
4278   const PetscScalar *b;
4279 
4280   PetscFunctionBegin;
4281   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4282   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4283   t  = a->solve_work;
4284 
4285   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4286   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4287 
4288   /* forward solve the lower triangular */
4289   idx    = 2*r[0];
4290   t[0] = b[idx]; t[1] = b[1+idx];
4291   for (i=1; i<n; i++) {
4292     v     = aa + 4*ai[i];
4293     vi    = aj + ai[i];
4294     nz    = ai[i+1] - ai[i];
4295     idx   = 2*r[i];
4296     s1  = b[idx]; s2 = b[1+idx];
4297     for(m=0;m<nz;m++){
4298       jdx   = 2*vi[m];
4299       x1    = t[jdx]; x2 = t[1+jdx];
4300       s1 -= v[0]*x1 + v[2]*x2;
4301       s2 -= v[1]*x1 + v[3]*x2;
4302       v += 4;
4303     }
4304     idx = 2*i;
4305     t[idx] = s1; t[1+idx] = s2;
4306   }
4307   /* backward solve the upper triangular */
4308   for (i=n-1; i>=0; i--){
4309     v    = aa + 4*(adiag[i+1]+1);
4310     vi   = aj + adiag[i+1]+1;
4311     nz   = adiag[i] - adiag[i+1] - 1;
4312     idt  = 2*i;
4313     s1 = t[idt]; s2 = t[1+idt];
4314     for(m=0;m<nz;m++){
4315       idx   = 2*vi[m];
4316       x1    = t[idx]; x2 = t[1+idx];
4317       s1 -= v[0]*x1 + v[2]*x2;
4318       s2 -= v[1]*x1 + v[3]*x2;
4319       v += 4;
4320     }
4321     idc = 2*c[i];
4322     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4323     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4324   }
4325   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4326   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4327   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4328   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4329   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4330   PetscFunctionReturn(0);
4331 }
4332 
4333 /*
4334       Special case where the matrix was ILU(0) factored in the natural
4335    ordering. This eliminates the need for the column and row permutation.
4336 */
4337 #undef __FUNCT__
4338 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4339 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
4340 {
4341   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4342   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4343   PetscErrorCode    ierr;
4344   PetscInt          *diag = a->diag;
4345   const MatScalar   *aa=a->a,*v;
4346   PetscScalar       *x,s1,s2,x1,x2;
4347   const PetscScalar *b;
4348   PetscInt          jdx,idt,idx,nz,*vi,i;
4349 
4350   PetscFunctionBegin;
4351   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4352   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4353 
4354   /* forward solve the lower triangular */
4355   idx    = 0;
4356   x[0]   = b[0]; x[1] = b[1];
4357   for (i=1; i<n; i++) {
4358     v     =  aa      + 4*ai[i];
4359     vi    =  aj      + ai[i];
4360     nz    =  diag[i] - ai[i];
4361     idx   +=  2;
4362     s1  =  b[idx];s2 = b[1+idx];
4363     while (nz--) {
4364       jdx   = 2*(*vi++);
4365       x1    = x[jdx];x2 = x[1+jdx];
4366       s1 -= v[0]*x1 + v[2]*x2;
4367       s2 -= v[1]*x1 + v[3]*x2;
4368       v    += 4;
4369     }
4370     x[idx]   = s1;
4371     x[1+idx] = s2;
4372   }
4373   /* backward solve the upper triangular */
4374   for (i=n-1; i>=0; i--){
4375     v    = aa + 4*diag[i] + 4;
4376     vi   = aj + diag[i] + 1;
4377     nz   = ai[i+1] - diag[i] - 1;
4378     idt  = 2*i;
4379     s1 = x[idt];  s2 = x[1+idt];
4380     while (nz--) {
4381       idx   = 2*(*vi++);
4382       x1    = x[idx];   x2 = x[1+idx];
4383       s1 -= v[0]*x1 + v[2]*x2;
4384       s2 -= v[1]*x1 + v[3]*x2;
4385       v    += 4;
4386     }
4387     v        = aa +  4*diag[i];
4388     x[idt]   = v[0]*s1 + v[2]*s2;
4389     x[1+idt] = v[1]*s1 + v[3]*s2;
4390   }
4391 
4392   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4393   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4394   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4395   PetscFunctionReturn(0);
4396 }
4397 
4398 #undef __FUNCT__
4399 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4400 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4401 {
4402     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4403     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4404     PetscErrorCode    ierr;
4405     PetscInt          jdx;
4406     const MatScalar   *aa=a->a,*v;
4407     PetscScalar       *x,s1,s2,x1,x2;
4408     const PetscScalar *b;
4409 
4410     PetscFunctionBegin;
4411     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4412     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4413     /* forward solve the lower triangular */
4414     idx    = 0;
4415     x[0] = b[idx]; x[1] = b[1+idx];
4416     for (i=1; i<n; i++) {
4417         v   = aa + 4*ai[i];
4418        vi   = aj + ai[i];
4419        nz   = ai[i+1] - ai[i];
4420        idx  = 2*i;
4421        s1   = b[idx];s2 = b[1+idx];
4422       for(k=0;k<nz;k++){
4423          jdx   = 2*vi[k];
4424           x1    = x[jdx];x2 = x[1+jdx];
4425           s1   -= v[0]*x1 + v[2]*x2;
4426           s2   -= v[1]*x1 + v[3]*x2;
4427            v   +=  4;
4428         }
4429        x[idx]   = s1;
4430        x[1+idx] = s2;
4431     }
4432 
4433    /* backward solve the upper triangular */
4434   for (i=n-1; i>=0; i--){
4435      v   = aa + 4*(adiag[i+1]+1);
4436      vi  = aj + adiag[i+1]+1;
4437      nz  = adiag[i] - adiag[i+1]-1;
4438      idt = 2*i;
4439      s1 = x[idt];  s2 = x[1+idt];
4440      for(k=0;k<nz;k++){
4441       idx   = 2*vi[k];
4442        x1    = x[idx];   x2 = x[1+idx];
4443        s1 -= v[0]*x1 + v[2]*x2;
4444        s2 -= v[1]*x1 + v[3]*x2;
4445          v    += 4;
4446     }
4447     /* x = inv_diagonal*x */
4448    x[idt]   = v[0]*s1 + v[2]*s2;
4449    x[1+idt] = v[1]*s1 + v[3]*s2;
4450   }
4451 
4452   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4453   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4454   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4455   PetscFunctionReturn(0);
4456 }
4457 
4458 #undef __FUNCT__
4459 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4460 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4461 {
4462   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4463   IS             iscol=a->col,isrow=a->row;
4464   PetscErrorCode ierr;
4465   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4466   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4467   MatScalar      *aa=a->a,*v;
4468   PetscScalar    *x,*b,s1,*t;
4469 
4470   PetscFunctionBegin;
4471   if (!n) PetscFunctionReturn(0);
4472 
4473   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4474   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4475   t  = a->solve_work;
4476 
4477   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4478   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4479 
4480   /* forward solve the lower triangular */
4481   t[0] = b[*r++];
4482   for (i=1; i<n; i++) {
4483     v     = aa + ai[i];
4484     vi    = aj + ai[i];
4485     nz    = diag[i] - ai[i];
4486     s1  = b[*r++];
4487     while (nz--) {
4488       s1 -= (*v++)*t[*vi++];
4489     }
4490     t[i] = s1;
4491   }
4492   /* backward solve the upper triangular */
4493   for (i=n-1; i>=0; i--){
4494     v    = aa + diag[i] + 1;
4495     vi   = aj + diag[i] + 1;
4496     nz   = ai[i+1] - diag[i] - 1;
4497     s1 = t[i];
4498     while (nz--) {
4499       s1 -= (*v++)*t[*vi++];
4500     }
4501     x[*c--] = t[i] = aa[diag[i]]*s1;
4502   }
4503 
4504   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4505   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4506   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4507   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4508   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4509   PetscFunctionReturn(0);
4510 }
4511 /*
4512       Special case where the matrix was ILU(0) factored in the natural
4513    ordering. This eliminates the need for the column and row permutation.
4514 */
4515 #undef __FUNCT__
4516 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4517 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
4518 {
4519   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4520   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4521   PetscErrorCode ierr;
4522   PetscInt       *diag = a->diag;
4523   MatScalar      *aa=a->a;
4524   PetscScalar    *x,*b;
4525   PetscScalar    s1,x1;
4526   MatScalar      *v;
4527   PetscInt       jdx,idt,idx,nz,*vi,i;
4528 
4529   PetscFunctionBegin;
4530   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4531   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4532 
4533   /* forward solve the lower triangular */
4534   idx    = 0;
4535   x[0]   = b[0];
4536   for (i=1; i<n; i++) {
4537     v     =  aa      + ai[i];
4538     vi    =  aj      + ai[i];
4539     nz    =  diag[i] - ai[i];
4540     idx   +=  1;
4541     s1  =  b[idx];
4542     while (nz--) {
4543       jdx   = *vi++;
4544       x1    = x[jdx];
4545       s1 -= v[0]*x1;
4546       v    += 1;
4547     }
4548     x[idx]   = s1;
4549   }
4550   /* backward solve the upper triangular */
4551   for (i=n-1; i>=0; i--){
4552     v    = aa + diag[i] + 1;
4553     vi   = aj + diag[i] + 1;
4554     nz   = ai[i+1] - diag[i] - 1;
4555     idt  = i;
4556     s1 = x[idt];
4557     while (nz--) {
4558       idx   = *vi++;
4559       x1    = x[idx];
4560       s1 -= v[0]*x1;
4561       v    += 1;
4562     }
4563     v        = aa +  diag[i];
4564     x[idt]   = v[0]*s1;
4565   }
4566   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4567   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4568   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
4569   PetscFunctionReturn(0);
4570 }
4571 
4572 /* ----------------------------------------------------------------*/
4573 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
4574 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
4575 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
4576 
4577 #undef __FUNCT__
4578 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
4579 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
4580 {
4581   Mat            C=B;
4582   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
4583   IS             isrow = b->row,isicol = b->icol;
4584   PetscErrorCode ierr;
4585   const PetscInt *r,*ic,*ics;
4586   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
4587   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4588   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4589   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4590   MatScalar      *v_work;
4591   PetscTruth     col_identity,row_identity,both_identity;
4592 
4593   PetscFunctionBegin;
4594   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4595   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4596 
4597   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4598   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
4599   ics  = ic;
4600 
4601   /* generate work space needed by dense LU factorization */
4602   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
4603 
4604   for (i=0; i<n; i++){
4605     /* zero rtmp */
4606     /* L part */
4607     nz    = bi[i+1] - bi[i];
4608     bjtmp = bj + bi[i];
4609     for  (j=0; j<nz; j++){
4610       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4611     }
4612 
4613     /* U part */
4614     nz = bdiag[i] - bdiag[i+1];
4615     bjtmp = bj + bdiag[i+1]+1;
4616     for  (j=0; j<nz; j++){
4617       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4618     }
4619 
4620     /* load in initial (unfactored row) */
4621     nz    = ai[r[i]+1] - ai[r[i]];
4622     ajtmp = aj + ai[r[i]];
4623     v     = aa + bs2*ai[r[i]];
4624     for (j=0; j<nz; j++) {
4625       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
4626     }
4627 
4628     /* elimination */
4629     bjtmp = bj + bi[i];
4630     nzL   = bi[i+1] - bi[i];
4631     for(k=0;k < nzL;k++) {
4632       row = bjtmp[k];
4633       pc = rtmp + bs2*row;
4634       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4635       if (flg) {
4636         pv         = b->a + bs2*bdiag[row];
4637         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
4638         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
4639         pv         = b->a + bs2*(bdiag[row+1]+1);
4640         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
4641         for (j=0; j<nz; j++) {
4642           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4643         }
4644         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
4645       }
4646     }
4647 
4648     /* finished row so stick it into b->a */
4649     /* L part */
4650     pv   = b->a + bs2*bi[i] ;
4651     pj   = b->j + bi[i] ;
4652     nz   = bi[i+1] - bi[i];
4653     for (j=0; j<nz; j++) {
4654       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4655     }
4656 
4657     /* Mark diagonal and invert diagonal for simplier triangular solves */
4658     pv  = b->a + bs2*bdiag[i];
4659     pj  = b->j + bdiag[i];
4660     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4661     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4662     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
4663 
4664     /* U part */
4665     pv = b->a + bs2*(bdiag[i+1]+1);
4666     pj = b->j + bdiag[i+1]+1;
4667     nz = bdiag[i] - bdiag[i+1] - 1;
4668     for (j=0; j<nz; j++){
4669       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4670     }
4671   }
4672 
4673   ierr = PetscFree(rtmp);CHKERRQ(ierr);
4674   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
4675   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4676   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4677 
4678   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4679   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
4680   both_identity = (PetscTruth) (row_identity && col_identity);
4681   if (both_identity){
4682     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
4683   } else {
4684     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
4685   }
4686   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N_newdatastruct;
4687 
4688   C->assembled = PETSC_TRUE;
4689   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
4690   PetscFunctionReturn(0);
4691 }
4692 
4693 /*
4694    ilu(0) with natural ordering under new data structure.
4695    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
4696    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
4697 */
4698 
4699 #undef __FUNCT__
4700 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
4701 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4702 {
4703 
4704   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
4705   PetscErrorCode     ierr;
4706   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
4707   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
4708 
4709   PetscFunctionBegin;
4710   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
4711   b    = (Mat_SeqBAIJ*)(fact)->data;
4712 
4713   /* allocate matrix arrays for new data structure */
4714   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
4715   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
4716   b->singlemalloc = PETSC_TRUE;
4717   if (!b->diag){
4718     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
4719     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
4720   }
4721   bdiag = b->diag;
4722 
4723   if (n > 0) {
4724     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
4725   }
4726 
4727   /* set bi and bj with new data structure */
4728   bi = b->i;
4729   bj = b->j;
4730 
4731   /* L part */
4732   bi[0] = 0;
4733   for (i=0; i<n; i++){
4734     nz = adiag[i] - ai[i];
4735     bi[i+1] = bi[i] + nz;
4736     aj = a->j + ai[i];
4737     for (j=0; j<nz; j++){
4738       *bj = aj[j]; bj++;
4739     }
4740   }
4741 
4742   /* U part */
4743   bi_temp = bi[n];
4744   bdiag[n] = bi[n]-1;
4745   for (i=n-1; i>=0; i--){
4746     nz = ai[i+1] - adiag[i] - 1;
4747     bi_temp = bi_temp + nz + 1;
4748     aj = a->j + adiag[i] + 1;
4749     for (j=0; j<nz; j++){
4750       *bj = aj[j]; bj++;
4751     }
4752     /* diag[i] */
4753     *bj = i; bj++;
4754     bdiag[i] = bi_temp - 1;
4755   }
4756   PetscFunctionReturn(0);
4757 }
4758 
4759 #undef __FUNCT__
4760 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
4761 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4762 {
4763   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
4764   IS                 isicol;
4765   PetscErrorCode     ierr;
4766   const PetscInt     *r,*ic;
4767   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
4768   PetscInt           *bi,*cols,nnz,*cols_lvl;
4769   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
4770   PetscInt           i,levels,diagonal_fill;
4771   PetscTruth         col_identity,row_identity,both_identity;
4772   PetscReal          f;
4773   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
4774   PetscBT            lnkbt;
4775   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
4776   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
4777   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
4778   PetscTruth         missing;
4779   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
4780 
4781   PetscFunctionBegin;
4782   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
4783   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
4784   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
4785 
4786   f             = info->fill;
4787   levels        = (PetscInt)info->levels;
4788   diagonal_fill = (PetscInt)info->diagonal_fill;
4789   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4790 
4791   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4792   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4793   both_identity = (PetscTruth) (row_identity && col_identity);
4794 
4795   if (!levels && both_identity) {
4796     /* special case: ilu(0) with natural ordering */
4797     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4798     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
4799 
4800     fact->factor = MAT_FACTOR_ILU;
4801     (fact)->info.factor_mallocs    = 0;
4802     (fact)->info.fill_ratio_given  = info->fill;
4803     (fact)->info.fill_ratio_needed = 1.0;
4804     b                = (Mat_SeqBAIJ*)(fact)->data;
4805     b->row           = isrow;
4806     b->col           = iscol;
4807     b->icol          = isicol;
4808     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4809     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4810     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4811     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4812     PetscFunctionReturn(0);
4813   }
4814 
4815   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4816   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4817 
4818   /* get new row pointers */
4819   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
4820   bi[0] = 0;
4821   /* bdiag is location of diagonal in factor */
4822   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
4823   bdiag[0]  = 0;
4824 
4825   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
4826 
4827   /* create a linked list for storing column indices of the active row */
4828   nlnk = n + 1;
4829   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
4830 
4831   /* initial FreeSpace size is f*(ai[n]+1) */
4832   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
4833   current_space = free_space;
4834   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
4835   current_space_lvl = free_space_lvl;
4836 
4837   for (i=0; i<n; i++) {
4838     nzi = 0;
4839     /* copy current row into linked list */
4840     nnz  = ai[r[i]+1] - ai[r[i]];
4841     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
4842     cols = aj + ai[r[i]];
4843     lnk[i] = -1; /* marker to indicate if diagonal exists */
4844     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
4845     nzi += nlnk;
4846 
4847     /* make sure diagonal entry is included */
4848     if (diagonal_fill && lnk[i] == -1) {
4849       fm = n;
4850       while (lnk[fm] < i) fm = lnk[fm];
4851       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
4852       lnk[fm]    = i;
4853       lnk_lvl[i] = 0;
4854       nzi++; dcount++;
4855     }
4856 
4857     /* add pivot rows into the active row */
4858     nzbd = 0;
4859     prow = lnk[n];
4860     while (prow < i) {
4861       nnz      = bdiag[prow];
4862       cols     = bj_ptr[prow] + nnz + 1;
4863       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
4864       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
4865       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
4866       nzi += nlnk;
4867       prow = lnk[prow];
4868       nzbd++;
4869     }
4870     bdiag[i] = nzbd;
4871     bi[i+1]  = bi[i] + nzi;
4872 
4873     /* if free space is not available, make more free space */
4874     if (current_space->local_remaining<nzi) {
4875       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
4876       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
4877       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
4878       reallocs++;
4879     }
4880 
4881     /* copy data into free_space and free_space_lvl, then initialize lnk */
4882     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
4883     bj_ptr[i]    = current_space->array;
4884     bjlvl_ptr[i] = current_space_lvl->array;
4885 
4886     /* make sure the active row i has diagonal entry */
4887     if (*(bj_ptr[i]+bdiag[i]) != i) {
4888       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
4889     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
4890     }
4891 
4892     current_space->array           += nzi;
4893     current_space->local_used      += nzi;
4894     current_space->local_remaining -= nzi;
4895     current_space_lvl->array           += nzi;
4896     current_space_lvl->local_used      += nzi;
4897     current_space_lvl->local_remaining -= nzi;
4898   }
4899 
4900   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4901   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4902 
4903   /* destroy list of free space and other temporary arrays */
4904   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
4905 
4906   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
4907   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
4908 
4909   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
4910   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
4911   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
4912 
4913 #if defined(PETSC_USE_INFO)
4914   {
4915     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
4916     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
4917     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4918     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
4919     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4920     if (diagonal_fill) {
4921       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
4922     }
4923   }
4924 #endif
4925 
4926   /* put together the new matrix */
4927   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4928   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4929   b = (Mat_SeqBAIJ*)(fact)->data;
4930   b->free_a       = PETSC_TRUE;
4931   b->free_ij      = PETSC_TRUE;
4932   b->singlemalloc = PETSC_FALSE;
4933   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
4934   b->j          = bj;
4935   b->i          = bi;
4936   b->diag       = bdiag;
4937   b->free_diag  = PETSC_TRUE;
4938   b->ilen       = 0;
4939   b->imax       = 0;
4940   b->row        = isrow;
4941   b->col        = iscol;
4942   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4943   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4944   b->icol       = isicol;
4945   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4946   /* In b structure:  Free imax, ilen, old a, old j.
4947      Allocate bdiag, solve_work, new a, new j */
4948   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
4949   b->maxnz = b->nz = bdiag[0]+1;
4950   fact->info.factor_mallocs    = reallocs;
4951   fact->info.fill_ratio_given  = f;
4952   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
4953   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
4954   PetscFunctionReturn(0);
4955 }
4956 
4957 
4958 /*
4959      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
4960    except that the data structure of Mat_SeqAIJ is slightly different.
4961    Not a good example of code reuse.
4962 */
4963 #undef __FUNCT__
4964 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
4965 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4966 {
4967   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
4968   IS             isicol;
4969   PetscErrorCode ierr;
4970   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
4971   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
4972   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
4973   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
4974   PetscTruth     col_identity,row_identity,both_identity,flg;
4975   PetscReal      f;
4976   PetscTruth     newdatastruct = PETSC_FALSE;
4977 
4978   PetscFunctionBegin;
4979   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
4980   if (newdatastruct){
4981     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4982     PetscFunctionReturn(0);
4983   }
4984 
4985   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
4986   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
4987 
4988   f             = info->fill;
4989   levels        = (PetscInt)info->levels;
4990   diagonal_fill = (PetscInt)info->diagonal_fill;
4991   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4992 
4993   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4994   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4995   both_identity = (PetscTruth) (row_identity && col_identity);
4996 
4997   if (!levels && both_identity) {  /* special case copy the nonzero structure */
4998     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
4999     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5000 
5001     fact->factor = MAT_FACTOR_ILU;
5002     b            = (Mat_SeqBAIJ*)fact->data;
5003     b->row       = isrow;
5004     b->col       = iscol;
5005     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5006     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5007     b->icol      = isicol;
5008     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5009     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5010     PetscFunctionReturn(0);
5011   }
5012 
5013   /* general case perform the symbolic factorization */
5014     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5015     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5016 
5017     /* get new row pointers */
5018     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5019     ainew[0] = 0;
5020     /* don't know how many column pointers are needed so estimate */
5021     jmax = (PetscInt)(f*ai[n] + 1);
5022     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5023     /* ajfill is level of fill for each fill entry */
5024     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5025     /* fill is a linked list of nonzeros in active row */
5026     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5027     /* im is level for each filled value */
5028     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5029     /* dloc is location of diagonal in factor */
5030     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5031     dloc[0]  = 0;
5032     for (prow=0; prow<n; prow++) {
5033 
5034       /* copy prow into linked list */
5035       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5036       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5037       xi         = aj + ai[r[prow]];
5038       fill[n]    = n;
5039       fill[prow] = -1; /* marker for diagonal entry */
5040       while (nz--) {
5041 	fm  = n;
5042 	idx = ic[*xi++];
5043 	do {
5044 	  m  = fm;
5045 	  fm = fill[m];
5046 	} while (fm < idx);
5047 	fill[m]   = idx;
5048 	fill[idx] = fm;
5049 	im[idx]   = 0;
5050       }
5051 
5052       /* make sure diagonal entry is included */
5053       if (diagonal_fill && fill[prow] == -1) {
5054 	fm = n;
5055 	while (fill[fm] < prow) fm = fill[fm];
5056 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5057 	fill[fm]   = prow;
5058 	im[prow]   = 0;
5059 	nzf++;
5060 	dcount++;
5061       }
5062 
5063       nzi = 0;
5064       row = fill[n];
5065       while (row < prow) {
5066 	incrlev = im[row] + 1;
5067 	nz      = dloc[row];
5068 	xi      = ajnew  + ainew[row] + nz + 1;
5069 	flev    = ajfill + ainew[row] + nz + 1;
5070 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5071 	fm      = row;
5072 	while (nnz-- > 0) {
5073 	  idx = *xi++;
5074 	  if (*flev + incrlev > levels) {
5075 	    flev++;
5076 	    continue;
5077 	  }
5078 	  do {
5079 	    m  = fm;
5080 	    fm = fill[m];
5081 	  } while (fm < idx);
5082 	  if (fm != idx) {
5083 	    im[idx]   = *flev + incrlev;
5084 	    fill[m]   = idx;
5085 	    fill[idx] = fm;
5086 	    fm        = idx;
5087 	    nzf++;
5088 	  } else {
5089 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5090 	  }
5091 	  flev++;
5092 	}
5093 	row = fill[row];
5094 	nzi++;
5095       }
5096       /* copy new filled row into permanent storage */
5097       ainew[prow+1] = ainew[prow] + nzf;
5098       if (ainew[prow+1] > jmax) {
5099 
5100 	/* estimate how much additional space we will need */
5101 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5102 	/* just double the memory each time */
5103 	PetscInt maxadd = jmax;
5104 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5105 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5106 	jmax += maxadd;
5107 
5108 	/* allocate a longer ajnew and ajfill */
5109 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5110 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5111 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5112 	ajnew = xitmp;
5113 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5114 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5115 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5116 	ajfill = xitmp;
5117 	reallocate++; /* count how many reallocations are needed */
5118       }
5119       xitmp       = ajnew + ainew[prow];
5120       flev        = ajfill + ainew[prow];
5121       dloc[prow]  = nzi;
5122       fm          = fill[n];
5123       while (nzf--) {
5124 	*xitmp++ = fm;
5125 	*flev++ = im[fm];
5126 	fm      = fill[fm];
5127       }
5128       /* make sure row has diagonal entry */
5129       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5130 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5131     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5132       }
5133     }
5134     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5135     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5136     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5137     ierr = PetscFree(fill);CHKERRQ(ierr);
5138     ierr = PetscFree(im);CHKERRQ(ierr);
5139 
5140 #if defined(PETSC_USE_INFO)
5141     {
5142       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5143       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5144       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5145       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5146       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5147       if (diagonal_fill) {
5148 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5149       }
5150     }
5151 #endif
5152 
5153     /* put together the new matrix */
5154     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5155     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5156     b    = (Mat_SeqBAIJ*)fact->data;
5157     b->free_a       = PETSC_TRUE;
5158     b->free_ij      = PETSC_TRUE;
5159     b->singlemalloc = PETSC_FALSE;
5160     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5161     b->j          = ajnew;
5162     b->i          = ainew;
5163     for (i=0; i<n; i++) dloc[i] += ainew[i];
5164     b->diag       = dloc;
5165     b->free_diag  = PETSC_TRUE;
5166     b->ilen       = 0;
5167     b->imax       = 0;
5168     b->row        = isrow;
5169     b->col        = iscol;
5170     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5171     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5172     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5173     b->icol       = isicol;
5174     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5175     /* In b structure:  Free imax, ilen, old a, old j.
5176        Allocate dloc, solve_work, new a, new j */
5177     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5178     b->maxnz          = b->nz = ainew[n];
5179 
5180     fact->info.factor_mallocs    = reallocate;
5181     fact->info.fill_ratio_given  = f;
5182     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5183 
5184   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5185   PetscFunctionReturn(0);
5186 }
5187 
5188 #undef __FUNCT__
5189 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5190 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5191 {
5192   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5193   /* int i,*AJ=a->j,nz=a->nz; */
5194   PetscFunctionBegin;
5195   /* Undo Column scaling */
5196 /*    while (nz--) { */
5197 /*      AJ[i] = AJ[i]/4; */
5198 /*    } */
5199   /* This should really invoke a push/pop logic, but we don't have that yet. */
5200   A->ops->setunfactored = PETSC_NULL;
5201   PetscFunctionReturn(0);
5202 }
5203 
5204 #undef __FUNCT__
5205 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5206 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5207 {
5208   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5209   PetscInt       *AJ=a->j,nz=a->nz;
5210   unsigned short *aj=(unsigned short *)AJ;
5211   PetscFunctionBegin;
5212   /* Is this really necessary? */
5213   while (nz--) {
5214     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5215   }
5216   A->ops->setunfactored = PETSC_NULL;
5217   PetscFunctionReturn(0);
5218 }
5219 
5220 
5221